diff options
author | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-30 10:42:40 +0000 |
---|---|---|
committer | Anthony Barbier <Anthony.barbier@arm.com> | 2018-12-11 13:55:49 +0000 |
commit | ff0bccfb4697c591d569db9c2dc223f2e311a7d3 (patch) | |
tree | 3db31636df5d8568fea1db1d275f45e3d63a70d3 /src | |
parent | 5ba5e0938e68d4f90f5545a81066d56f022b376a (diff) | |
download | ComputeLibrary-ff0bccfb4697c591d569db9c2dc223f2e311a7d3.tar.gz |
COMPMID-1497: Add support for interleaved B reshaping in gemm_interleaved
Change-Id: I2171e1bf707bdcfa221c18d7a8904979e110020d
Reviewed-on: https://review.mlplatform.org/326
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src')
3 files changed, 333 insertions, 61 deletions
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp index 2c9cd320f0..3b2975dd80 100644 --- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp +++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp @@ -101,6 +101,14 @@ void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_wor using strategy = typename Kernel<To, use_dot>::strategy; unsigned int offset_transformed_b = 0; + unsigned int wl_index = 0; + unsigned int num_buffers = 0, reshaped_block_size = 0; + + if(!_b_is_pretransposed) + { + num_buffers = _transformed_b->info()->tensor_shape()[1]; + reshaped_block_size = _transformed_b->info()->tensor_shape()[0]; + } execute_window_loop(_block_walker, [&](const Coordinates & id) { const unsigned int x0 = id.x(); @@ -122,7 +130,9 @@ void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_wor } else { - ARM_COMPUTE_ERROR("Not supported"); + // Rotate through the BufferManager's buffers: + wl_index++; + offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size; } }); } diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp index 41a031c1c7..7fc57f3c02 100644 --- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp +++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp @@ -35,10 +35,18 @@ namespace arm_compute namespace { // Call the lambda function for each workload generated by the passed window. -template <typename To, bool use_dot, typename Lambda> +template <typename To, bool use_dot, bool use_buffer_manager, typename Lambda> void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda) { - using strategy = typename Kernel<To, use_dot>::strategy; + using strategy = typename Kernel<To, use_dot>::strategy; + unsigned int wl_index = 0; + unsigned int num_buffers = 0, reshaped_block_size = 0; + + if(use_buffer_manager) + { + num_buffers = transformed_b->info()->tensor_shape()[1]; + reshaped_block_size = transformed_b->info()->strides_in_bytes().y(); + } unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes(); execute_window_loop(window, [&](const Coordinates & coordinates) @@ -62,7 +70,16 @@ void for_each_element_in_window(const Window &window, const ITensor *b, ITensor lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax)); //Each workload represents one block: - offset_transformed_b += (x_size * k_size * sizeof(To)); + if(use_buffer_manager) + { + // Rotate through the BufferManager's buffers: + wl_index++; + offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size; + } + else + { + offset_transformed_b += (x_size * k_size * sizeof(To)); + } }); } @@ -142,7 +159,7 @@ void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::transform(cons template <typename To, bool use_dot> void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::create_workloads(std::vector<PrepareBWorkload> &workloads) { - for_each_element_in_window<To, use_dot>(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl) + for_each_element_in_window<To, use_dot, true>(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl) { workloads.push_back(std::move(wl)); }); @@ -152,7 +169,7 @@ template <typename To, bool use_dot> void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window()); - for_each_element_in_window<To, use_dot>(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl) + for_each_element_in_window<To, use_dot, false>(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl) { this->transform(wl, info); }); diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp index dcb2f856f0..fe998a0e42 100644 --- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp +++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp @@ -32,8 +32,149 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include <atomic> +#include <condition_variable> +#include <mutex> + namespace arm_compute { +#ifndef NO_MULTI_THREADING +class BufferManagerMultipleThreads final : public IBufferManager +{ +public: + /** Number of buffers to ping pong between */ + static constexpr unsigned int NUM_BUFFERS = 3; + + explicit BufferManagerMultipleThreads(unsigned int max_num_users) + : _max_num_users(max_num_users) + { + } + unsigned int num_buffers() const override + { + return NUM_BUFFERS; + } + /* - Lock the requested index if it's free and return true if it needs reshaping. + * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped. + * - Block if the corresponding buffer for the given index is still being used by a different index. + */ + bool lock_to_reshape_if_needed(unsigned int index) override + { + Buffer &buf = get_buffer_from_index(index); + while(true) + { + if(buf.index == index && buf.state != State::FREE) + { + //Another thread already is reshaping / has reshaped this block: nothing to do + return false; + } + else + { + std::unique_lock<std::mutex> lock(buf.mutex); + //If the buffer is free then lock it for reshaping: + if(buf.state == State::FREE) + { + buf.index = index; + buf.state = State::BEING_RESHAPED; + return true; + } + // Check again just in case it changed while we were acquiring the lock: + if(buf.index == index) + { + //Another thread is reshaping this block already, nothing to do + return false; + } + // buf.index != index: Buffer still being used by another block, need to wait + buf.sem.wait(lock); + } + } + } + /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */ + void mark_as_reshaped(unsigned int index) override + { + Buffer &buf = get_buffer_from_index(index); + { + std::lock_guard<std::mutex> lock(buf.mutex); + buf.users = _max_num_users; + buf.state = State::IN_USE; + } + buf.sem.notify_all(); + } + + /* Block until the buffer at the given index is reshaped */ + void wait_for_reshaping(unsigned int index) override + { + Buffer &buf = get_buffer_from_index(index); + ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed() + // Check if it's already ready to use: + if(buf.state == State::IN_USE) + return; + std::unique_lock<std::mutex> lock(buf.mutex); + //Double check it didn't change while we were acquiring the lock: + if(buf.state == State::IN_USE) + return; + buf.sem.wait(lock); + } + /* Mark the buffer at the given index as not used by this thread anymore. + * Once all the threads have called this method then the buffer is marked as free again. + */ + void mark_as_unused(unsigned int index) override + { + Buffer &buf = get_buffer_from_index(index); + ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed() + if(--buf.users == 0) + { + std::unique_lock<std::mutex> lock(buf.mutex); + buf.state = State::FREE; + lock.unlock(); + buf.sem.notify_all(); + } + } + +private: + enum class State + { + FREE, + BEING_RESHAPED, + IN_USE + }; + struct Buffer + { + unsigned int index{}; + std::atomic_uint users{}; + State state{ State::FREE }; + std::mutex mutex{}; + std::condition_variable sem{}; + } _buffers[NUM_BUFFERS]; + Buffer &get_buffer_from_index(unsigned int index) + { + return _buffers[index % NUM_BUFFERS]; + } + unsigned int _max_num_users; +}; +#endif /* NO_MULTI_THREADING */ + +class BufferManagerSingleThread : public IBufferManager +{ +public: + unsigned int num_buffers() const override + { + return 1; + } + bool lock_to_reshape_if_needed(unsigned int index) override + { + return true; + } + void mark_as_reshaped(unsigned int index) override + { + } + void wait_for_reshaping(unsigned int index) override + { + } + void mark_as_unused(unsigned int index) override + { + } +}; + NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)) { @@ -72,6 +213,7 @@ void NEGEMMInterleavedWrapper::prepare() // Keep the smallest of the two: const unsigned int num_windows = std::min(num_iterations, max_iterations); const TensorShape window_shape = _batch_window.shape(); + const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX); // Create a 1D window to dynamically split the batch window: Window win_1D; @@ -80,28 +222,112 @@ void NEGEMMInterleavedWrapper::prepare() // Create one workload for each sub-window: for(unsigned int w = 0; w < num_windows; w++) { - Window win = win_1D.split_window(0, w, num_windows); - const Coordinates start_offset = index2coords(window_shape, win.x().start()); - const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1); - const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX); + Window win = win_1D.split_window(0, w, num_windows); + const Coordinates start_offset = index2coords(window_shape, win.x().start()); + const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1); - auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info) + if(_pretranspose_b) { - //For each block of rows in "M" - auto workload_mm = this->_mm_workloads.begin(); - for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++) + auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info) { - // Transform one k_block from A: - this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset); - // Then perform the matrix multiplication for each x block along N: - for(unsigned int i = 0; i < num_x_blocks; i++) + //For each block of rows in "M" + auto workload_mm = this->_mm_workloads.begin(); + for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++) { - ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end()); - this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset); + // Transform one k_block from A: + this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset); + // Then perform the matrix multiplication for each x block along N: + for(unsigned int i = 0; i < num_x_blocks; i++) + { + ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end()); + this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset); + } } - } - }; - _workloads.push_back(workload); + }; + _workloads.push_back(workload); + } + else + { + auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info) + { + //For each block of rows in "M" + auto workload_mm = this->_mm_workloads.begin(); + unsigned int workload_b = 0; + //If there is only one thread then only reshape the B blocks as you need them: + unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1; + + for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++) + { + // Transform one k_block from A: + this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset); + // Then perform the matrix multiplication for each x block along N: + for(unsigned int i = 0; i < num_x_blocks; i++) + { + ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end()); + if(workload_b_next < this->_b_workloads.size()) + { + //Lock on BufferManager: need to run it ? + if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next)) + { + this->_prepare_b->transform(this->_b_workloads[workload_b_next], info); + this->_buffer_manager->mark_as_reshaped(workload_b_next); + } + workload_b_next++; + } + ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size()); + // Run if needed or wait + if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b)) + { + this->_prepare_b->transform(this->_b_workloads[workload_b], info); + this->_buffer_manager->mark_as_reshaped(workload_b); + } + this->_buffer_manager->wait_for_reshaping(workload_b); + this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset); + this->_buffer_manager->mark_as_unused(workload_b); + workload_b++; + } + } + }; + _workloads.push_back(workload); + } + } + if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0) + { + //Make sure the number of workloads is a multiple of the number of threads to avoid dead locks: + for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++) + { + auto workload = [this](const ThreadInfo & info) + { + unsigned int workload_b = 0; + //If there is only one thread then only reshape the B blocks as you need them: + unsigned int workload_b_next = 1; + + for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++) + { + if(workload_b_next < this->_b_workloads.size()) + { + //Lock on BufferManager: need to run it ? + if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next)) + { + this->_prepare_b->transform(this->_b_workloads[workload_b_next], info); + this->_buffer_manager->mark_as_reshaped(workload_b_next); + } + workload_b_next++; + } + ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size()); + // Run if needed or wait + if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b)) + { + this->_prepare_b->transform(this->_b_workloads[workload_b], info); + this->_buffer_manager->mark_as_reshaped(workload_b); + } + this->_buffer_manager->wait_for_reshaping(workload_b); + this->_buffer_manager->mark_as_unused(workload_b); + workload_b++; + } + }; + _workloads.push_back(workload); + } } _is_prepared = true; @@ -158,62 +384,81 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe if(!_pretranspose_b) { + _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot); + _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height)); + _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches)); + // If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them: + const unsigned int num_iterations = _batch_window.num_iterations_total(); + if(NEScheduler::get().num_threads() == 1 || num_iterations == 1) + { + _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>(); + } + else + { +#ifdef NO_MULTI_THREADING + ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads"); +#else /* NO_MULTI_THREADING */ + _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads()); +#endif /* NO_MULTI_THREADING */ + } // If B is transposed at every iteration then transformed_B can be managed: _memory_group.manage(&_transformed_b); - _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot); + auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers()))); } else { _tag += "_preB"; - switch(input_type) - { - case DataType::F32: - _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params); - break; + } + switch(input_type) + { + case DataType::F32: + _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params); + break; #ifdef __aarch64__ - case DataType::U8: - case DataType::QASYMM8: - if(use_dot) - { - _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params); - } - else - { - _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params); - } - break; - case DataType::S8: - if(use_dot) - { - _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params); - } - else - { - _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params); - } - break; + case DataType::U8: + case DataType::QASYMM8: + if(use_dot) + { + _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params); + } + else + { + _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params); + } + break; + case DataType::S8: + if(use_dot) + { + _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params); + } + else + { + _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params); + } + break; #endif /* __aarch64__ */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params); - break; + case DataType::F16: + _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params); + break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("DataType not supported"); - break; - } - ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr); + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr); + if(_pretranspose_b) + { _block_sizes = _prepare_b->block_sizes(); + _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height)); + _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches)); } _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block)); _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block)); _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis)); - _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height)); - _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches)); - _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment); _memory_group.manage(&_transformed_a); _memory_group.manage(&_tmp_c); |