author    Anthony Barbier <anthony.barbier@arm.com>    2018-11-30 10:42:40 +0000
committer Anthony Barbier <Anthony.barbier@arm.com>    2018-12-11 13:55:49 +0000
commit    ff0bccfb4697c591d569db9c2dc223f2e311a7d3 (patch)
tree      3db31636df5d8568fea1db1d275f45e3d63a70d3 /src/runtime/NEON/functions/assembly
parent    5ba5e0938e68d4f90f5545a81066d56f022b376a (diff)
download  ComputeLibrary-ff0bccfb4697c591d569db9c2dc223f2e311a7d3.tar.gz
COMPMID-1497: Add support for interleaved B reshaping in gemm_interleaved
Change-Id: I2171e1bf707bdcfa221c18d7a8904979e110020d
Reviewed-on: https://review.mlplatform.org/326
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/runtime/NEON/functions/assembly')
-rw-r--r--  src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp  355
1 file changed, 300 insertions(+), 55 deletions(-)
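
The BufferManagerMultipleThreads added in the diff below rotates B-block reshapes through a small pool of NUM_BUFFERS (3) buffers: a block index is mapped onto a slot with index % NUM_BUFFERS, and each slot cycles FREE -> BEING_RESHAPED -> IN_USE -> FREE as blocks are reshaped, consumed and released. The following is a minimal, single-threaded sketch of that state machine; the struct and function names are illustrative stand-ins for the IBufferManager interface in the patch, not the library's actual classes.

// Minimal sketch (illustrative only) of the ping-pong buffer state machine:
// each B-block index maps onto one of a small, fixed number of buffer slots.
#include <array>
#include <cassert>
#include <cstdio>

enum class State { FREE, BEING_RESHAPED, IN_USE };

struct Slot
{
    unsigned int index{ 0 }; // B-block currently owning this slot
    unsigned int users{ 0 }; // consumers still reading the reshaped block
    State        state{ State::FREE };
};

constexpr unsigned int NUM_BUFFERS = 3; // same pool size as the patch
std::array<Slot, NUM_BUFFERS> slots{};

// Single-threaded equivalents of the IBufferManager calls in the diff:
bool lock_to_reshape_if_needed(unsigned int index)
{
    Slot &s = slots[index % NUM_BUFFERS];
    if(s.state != State::FREE && s.index == index)
    {
        return false; // already reshaped (or being reshaped) by another worker
    }
    assert(s.state == State::FREE); // with one thread the slot must be free here
    s.index = index;
    s.state = State::BEING_RESHAPED;
    return true;
}

void mark_as_reshaped(unsigned int index, unsigned int num_users)
{
    Slot &s = slots[index % NUM_BUFFERS];
    s.users = num_users;
    s.state = State::IN_USE;
}

void mark_as_unused(unsigned int index)
{
    Slot &s = slots[index % NUM_BUFFERS];
    if(--s.users == 0)
    {
        s.state = State::FREE; // last consumer returns the slot to the pool
    }
}

int main()
{
    // Walk 8 hypothetical B blocks through the pool of 3 slots:
    for(unsigned int block = 0; block < 8; ++block)
    {
        if(lock_to_reshape_if_needed(block))
        {
            std::printf("reshaping block %u into slot %u\n", block, block % NUM_BUFFERS);
            mark_as_reshaped(block, /* num_users = */ 1);
        }
        // ... the matrix multiplication would consume the reshaped block here ...
        mark_as_unused(block);
    }
    return 0;
}

In the real class the slot also carries a mutex and a condition variable so that a thread whose block maps onto a slot still owned by an older block waits until the last user frees it.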
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index dcb2f856f0..fe998a0e42 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -32,8 +32,149 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
namespace arm_compute
{
+#ifndef NO_MULTI_THREADING
+class BufferManagerMultipleThreads final : public IBufferManager
+{
+public:
+ /** Number of buffers to ping-pong between */
+ static constexpr unsigned int NUM_BUFFERS = 3;
+
+ explicit BufferManagerMultipleThreads(unsigned int max_num_users)
+ : _max_num_users(max_num_users)
+ {
+ }
+ unsigned int num_buffers() const override
+ {
+ return NUM_BUFFERS;
+ }
+ /* - Lock the requested index if it's free and return true if it needs reshaping.
+ * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
+ * - Block if the corresponding buffer for the given index is still being used by a different index.
+ */
+ bool lock_to_reshape_if_needed(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ while(true)
+ {
+ if(buf.index == index && buf.state != State::FREE)
+ {
+ //Another thread is already reshaping / has already reshaped this block: nothing to do
+ return false;
+ }
+ else
+ {
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ //If the buffer is free then lock it for reshaping:
+ if(buf.state == State::FREE)
+ {
+ buf.index = index;
+ buf.state = State::BEING_RESHAPED;
+ return true;
+ }
+ // Check again just in case it changed while we were acquiring the lock:
+ if(buf.index == index)
+ {
+ //Another thread is reshaping this block already, nothing to do
+ return false;
+ }
+ // buf.index != index: Buffer still being used by another block, need to wait
+ buf.sem.wait(lock);
+ }
+ }
+ }
+ /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
+ void mark_as_reshaped(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ {
+ std::lock_guard<std::mutex> lock(buf.mutex);
+ buf.users = _max_num_users;
+ buf.state = State::IN_USE;
+ }
+ buf.sem.notify_all();
+ }
+
+ /* Block until the buffer at the given index is reshaped */
+ void wait_for_reshaping(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+ // Check if it's already ready to use:
+ if(buf.state == State::IN_USE)
+ return;
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ //Double check it didn't change while we were acquiring the lock:
+ if(buf.state == State::IN_USE)
+ return;
+ buf.sem.wait(lock);
+ }
+ /* Mark the buffer at the given index as not used by this thread anymore.
+ * Once all the threads have called this method, the buffer is marked as free again.
+ */
+ void mark_as_unused(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+ if(--buf.users == 0)
+ {
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ buf.state = State::FREE;
+ lock.unlock();
+ buf.sem.notify_all();
+ }
+ }
+
+private:
+ enum class State
+ {
+ FREE,
+ BEING_RESHAPED,
+ IN_USE
+ };
+ struct Buffer
+ {
+ unsigned int index{};
+ std::atomic_uint users{};
+ State state{ State::FREE };
+ std::mutex mutex{};
+ std::condition_variable sem{};
+ } _buffers[NUM_BUFFERS];
+ Buffer &get_buffer_from_index(unsigned int index)
+ {
+ return _buffers[index % NUM_BUFFERS];
+ }
+ unsigned int _max_num_users;
+};
+#endif /* NO_MULTI_THREADING */
+
+class BufferManagerSingleThread : public IBufferManager
+{
+public:
+ unsigned int num_buffers() const override
+ {
+ return 1;
+ }
+ bool lock_to_reshape_if_needed(unsigned int index) override
+ {
+ return true;
+ }
+ void mark_as_reshaped(unsigned int index) override
+ {
+ }
+ void wait_for_reshaping(unsigned int index) override
+ {
+ }
+ void mark_as_unused(unsigned int index) override
+ {
+ }
+};
+
NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager))
{
@@ -72,6 +213,7 @@ void NEGEMMInterleavedWrapper::prepare()
// Keep the smallest of the two:
const unsigned int num_windows = std::min(num_iterations, max_iterations);
const TensorShape window_shape = _batch_window.shape();
+ const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
// Create a 1D window to dynamically split the batch window:
Window win_1D;
@@ -80,28 +222,112 @@ void NEGEMMInterleavedWrapper::prepare()
// Create one workload for each sub-window:
for(unsigned int w = 0; w < num_windows; w++)
{
- Window win = win_1D.split_window(0, w, num_windows);
- const Coordinates start_offset = index2coords(window_shape, win.x().start());
- const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
- const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+ Window win = win_1D.split_window(0, w, num_windows);
+ const Coordinates start_offset = index2coords(window_shape, win.x().start());
+ const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
- auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+ if(_pretranspose_b)
{
- //For each block of rows in "M"
- auto workload_mm = this->_mm_workloads.begin();
- for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+ auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
{
- // Transform one k_block from A:
- this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
- // Then perform the matrix multiplication for each x block along N:
- for(unsigned int i = 0; i < num_x_blocks; i++)
+ //For each block of rows in "M"
+ auto workload_mm = this->_mm_workloads.begin();
+ for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
{
- ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
- this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ // Transform one k_block from A:
+ this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+ // Then perform the matrix multiplication for each x block along N:
+ for(unsigned int i = 0; i < num_x_blocks; i++)
+ {
+ ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+ this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ }
}
- }
- };
- _workloads.push_back(workload);
+ };
+ _workloads.push_back(workload);
+ }
+ else
+ {
+ auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+ {
+ //For each block of rows in "M"
+ auto workload_mm = this->_mm_workloads.begin();
+ unsigned int workload_b = 0;
+ //If there is only one thread then only reshape the B blocks as you need them:
+ unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
+
+ for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+ {
+ // Transform one k_block from A:
+ this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+ // Then perform the matrix multiplication for each x block along N:
+ for(unsigned int i = 0; i < num_x_blocks; i++)
+ {
+ ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+ if(workload_b_next < this->_b_workloads.size())
+ {
+ //Lock on BufferManager: do we need to run the reshape?
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b_next);
+ }
+ workload_b_next++;
+ }
+ ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+ // Run if needed or wait
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b);
+ }
+ this->_buffer_manager->wait_for_reshaping(workload_b);
+ this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ this->_buffer_manager->mark_as_unused(workload_b);
+ workload_b++;
+ }
+ }
+ };
+ _workloads.push_back(workload);
+ }
+ }
+ if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
+ {
+ //Make sure the number of workloads is a multiple of the number of threads to avoid deadlocks:
+ for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
+ {
+ auto workload = [this](const ThreadInfo & info)
+ {
+ unsigned int workload_b = 0;
+ //If there is only one thread then only reshape the B blocks as you need them:
+ unsigned int workload_b_next = 1;
+
+ for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
+ {
+ if(workload_b_next < this->_b_workloads.size())
+ {
+ //Lock on BufferManager: do we need to run the reshape?
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b_next);
+ }
+ workload_b_next++;
+ }
+ ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+ // Run if needed or wait
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b);
+ }
+ this->_buffer_manager->wait_for_reshaping(workload_b);
+ this->_buffer_manager->mark_as_unused(workload_b);
+ workload_b++;
+ }
+ };
+ _workloads.push_back(workload);
+ }
}
_is_prepared = true;
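
The padding loop at the end of this hunk exists because mark_as_reshaped() sets a buffer's user count to the maximum number of users (the thread count passed to the buffer manager), and a slot only returns to FREE once that many mark_as_unused() calls have arrived. If the number of workloads were not a multiple of the thread count, the idle threads in the last round would never release their share and the remaining threads could block forever, so dummy workloads are appended that only walk the B blocks and keep the buffer manager in sync. A small sketch of the rounding with hypothetical numbers (num_windows and num_threads here are assumed example values, not taken from the patch):

// Hypothetical example: 10 real windows on 4 threads leaves a remainder of 2,
// so 2 dummy workloads are appended to reach the next multiple of 4 (12).
#include <cstdio>

int main()
{
    const unsigned int num_windows = 10; // assumed example value
    const unsigned int num_threads = 4;  // assumed example value
    unsigned int total = num_windows;
    if(num_windows > 1 && num_windows % num_threads != 0)
    {
        for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; ++leftover)
        {
            ++total; // one extra workload that only keeps the buffer manager in sync
        }
    }
    std::printf("%u workloads scheduled (a multiple of %u threads)\n", total, num_threads);
    return 0;
}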
@@ -158,62 +384,81 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe
if(!_pretranspose_b)
{
+ _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+ _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+ _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+ // If the execution is single-threaded or there is only one window, the buffer manager only needs 1 buffer; otherwise we use NUM_BUFFERS buffers and ping-pong between them:
+ const unsigned int num_iterations = _batch_window.num_iterations_total();
+ if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
+ {
+ _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
+ }
+ else
+ {
+#ifdef NO_MULTI_THREADING
+ ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
+#else /* NO_MULTI_THREADING */
+ _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
+#endif /* NO_MULTI_THREADING */
+ }
// If B is transposed at every iteration then transformed_B can be managed:
_memory_group.manage(&_transformed_b);
- _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+ auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
}
else
{
_tag += "_preB";
- switch(input_type)
- {
- case DataType::F32:
- _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
- break;
+ }
+ switch(input_type)
+ {
+ case DataType::F32:
+ _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+ break;
#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
- }
- break;
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if(use_dot)
+ {
+ _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+ }
+ else
+ {
+ _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+ }
+ break;
+ case DataType::S8:
+ if(use_dot)
+ {
+ _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+ }
+ else
+ {
+ _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+ }
+ break;
#endif /* __aarch64__ */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
- break;
+ case DataType::F16:
+ _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+ break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+ if(_pretranspose_b)
+ {
_block_sizes = _prepare_b->block_sizes();
+ _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+ _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
}
_block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
_block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
_block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
- _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
- _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-
_transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
_memory_group.manage(&_transformed_a);
_memory_group.manage(&_tmp_c);
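
For reference, the non-pretransposed workload created in the prepare() hunk above keeps a one-block lookahead on B (workload_b_next): while the current B block is guaranteed reshaped and multiplied, whichever thread wins the lock opportunistically reshapes the next one. Below is a hedged, stand-alone sketch of that control flow; BufferManagerStub, reshape_b and multiply are placeholders for this illustration, not the patch's kernels or the library's API.

// Simplified sketch of the per-thread control flow for the non-pretransposed case:
// reshape the next B block ahead of time when possible, make sure the current block
// is reshaped before multiplying with it, then release it.
#include <cstdio>

struct BufferManagerStub
{
    // Stand-ins for the IBufferManager calls: single-threaded, so reshaping is always "needed".
    bool lock_to_reshape_if_needed(unsigned int) { return true; }
    void mark_as_reshaped(unsigned int) {}
    void wait_for_reshaping(unsigned int) {}
    void mark_as_unused(unsigned int) {}
};

void reshape_b(unsigned int block) { std::printf("reshape B block %u\n", block); }      // placeholder
void multiply(unsigned int a, unsigned int b) { std::printf("mm A%u x B%u\n", a, b); }  // placeholder

int main()
{
    const unsigned int num_a_blocks = 2; // k blocks of A (illustrative)
    const unsigned int num_x_blocks = 3; // x blocks along N (illustrative)
    const unsigned int num_b_blocks = num_a_blocks * num_x_blocks;

    BufferManagerStub buffer_manager;
    unsigned int workload_b      = 0;
    unsigned int workload_b_next = 1; // one-block lookahead, as in the patch

    for(unsigned int a = 0; a < num_a_blocks; ++a)
    {
        // transform_a(a) would run here
        for(unsigned int x = 0; x < num_x_blocks; ++x)
        {
            if(workload_b_next < num_b_blocks)
            {
                // Opportunistically reshape the *next* B block if nobody else has:
                if(buffer_manager.lock_to_reshape_if_needed(workload_b_next))
                {
                    reshape_b(workload_b_next);
                    buffer_manager.mark_as_reshaped(workload_b_next);
                }
                workload_b_next++;
            }
            // Make sure the *current* B block is reshaped, then consume it:
            if(buffer_manager.lock_to_reshape_if_needed(workload_b))
            {
                reshape_b(workload_b);
                buffer_manager.mark_as_reshaped(workload_b);
            }
            buffer_manager.wait_for_reshaping(workload_b);
            multiply(a, workload_b);
            buffer_manager.mark_as_unused(workload_b);
            workload_b++;
        }
    }
    return 0;
}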