From 3d677ccee046cd384abf2142f323f8e9e7a4834f Mon Sep 17 00:00:00 2001
From: Anthony Barbier
Date: Mon, 23 Jul 2018 16:42:59 +0100
Subject: COMPMID-1406: Refactor gemm_interleaved to use our own types and
 scheduler

- Ported PrepareB kernel from gemm_interleaved
- Ported TransformA feature from gemm_interleaved
- Allocate reshaped a and b buffers
- Added memory_manager / memory_group
- MatrixMultiply kernel
- Interleaved the kernels' execution
- Fixed a few bugs: all nightly Convolution tests passing for threads=1
  and threads=4
- Added Doxygen documentation and comments in the code
- Added support for all supported data types

Change-Id: Iffa1c09fda0bb9c61213bb83524d5a48e7ecb03c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141281
Tested-by: Jenkins
Reviewed-by: Georgios Pinitas
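A minimal usage sketch of the new wrapper (the shapes, the null memory
manager and the fill step are illustrative assumptions, not taken from this
patch; only the configure()/run() interface is introduced below):

    #include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // C = alpha * A * B + beta * C with A: M x K, B: K x N, C: M x N (F32).
        // TensorShape is (width, height), hence (K, M) for an M x K matrix.
        constexpr unsigned int M = 64, N = 64, K = 64;
        Tensor a{}, b{}, c{};
        a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
        c.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));

        // Configure before allocating so the wrapper can size its buffers:
        NEGEMMInterleavedWrapper gemm(nullptr); // no memory manager in this sketch
        gemm.configure(&a, &b, &c, 1.f, 0.f, true /* pretranspose_b */, false /* use_dot */);

        a.allocator()->allocate();
        b.allocator()->allocate();
        c.allocator()->allocate();
        // ... fill a and b, then:
        gemm.run();
        return 0;
    }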
---
 .../assembly/NEGEMMInterleavedWrapper.cpp          | 260 +++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp

(limited to 'src/runtime/NEON/functions/assembly')

diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
new file mode 100644
index 0000000000..434723ca1a
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager))
+{
+}
+void NEGEMMInterleavedWrapper::run()
+{
+    prepare();
+
+    _memory_group.acquire();
+    NEScheduler::get().run_workloads(_workloads);
+    _memory_group.release();
+}
+
+void NEGEMMInterleavedWrapper::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_pretranspose_b)
+        {
+            NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
+            _b->mark_as_unused();
+        }
+        else
+        {
+            _prepare_b->create_workloads(_b_workloads);
+        }
+        _transform_a->create_workloads(_a_workloads);
+        _matrix_multiply->create_workloads(_mm_workloads);
+
+        //Maximum number of workloads to create:
+        const unsigned int num_threads    = NEScheduler::get().num_threads();
+        const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads * 4;
+        //Maximum number of iterations the parameters allow:
+        const unsigned int num_iterations = _batch_window.num_iterations_total();
+        // Keep the smallest of the two:
+        const unsigned int num_windows  = std::min(num_iterations, max_iterations);
+        const TensorShape  window_shape = _batch_window.shape();
+
+        // Create a 1D window to dynamically split the batch window:
+        Window win_1D;
+        win_1D.set(0, Window::Dimension(0, num_iterations));
+
+        // Create one workload for each sub-window:
+        for(unsigned int w = 0; w < num_windows; w++)
+        {
+            Window             win          = win_1D.split_window(0, w, num_windows);
+            const Coordinates  start_offset = index2coords(window_shape, win.x().start());
+            const Coordinates  end_offset   = index2coords(window_shape, win.x().end() - 1);
+            const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+
+            auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+            {
+                //For each block of rows in "M"
+                auto workload_mm = this->_mm_workloads.begin();
+                for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                {
+                    // Transform one k_block from A:
+                    this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                    // Then perform the matrix multiplication for each x block along N:
+                    for(unsigned int i = 0; i < num_x_blocks; i++)
+                    {
+                        ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                        this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                    }
+                }
+            };
+            _workloads.push_back(workload);
+        }
+
+        _is_prepared = true;
+    }
+}
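+
+// Worked example of the split above (numbers are illustrative, not taken
+// from this patch): with num_threads = 4 the cap is max_iterations = 4 * 4 = 16;
+// if the batch window only yields num_iterations = 10, then
+// num_windows = min(10, 16) = 10 workloads are queued. Each workload covers
+// its own [start_offset, end_offset] slice of the batch window and alternates
+// one TransformA step with num_x_blocks MatrixMultiply steps.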
+
+namespace
+{
+// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
+template <typename To, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
+{
+    auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>>();
+    prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
+    return std::move(prepare_b);
+}
+
+// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
+template <typename To, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
+{
+    auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>>();
+    transform_a->configure(a, transformed_a, false, block_walker, params);
+    return std::move(transform_a);
+}
+
+// Factory to instantiate NEGEMMInterleavedMatrixMultiplyWrapperTemplate:
+template <typename To, typename Tr, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
+                                                                                    const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
+{
+    auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>>();
+    matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
+    return std::move(matrix_multiply);
+}
+} // namespace
+
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+{
+    _params         = INEGEMMWrapperKernel::extract_parameters(a, b, c);
+    _a              = a;
+    _b              = b;
+    _c              = c;
+    _pretranspose_b = pretranspose_b;
+
+    DataType input_type = a->info()->data_type();
+
+    // Forcing 128-byte alignment (required by 32-bit kernels)
+    const unsigned int alignment = 128;
+    _transformed_b.allocator()->init(TensorInfo{}, alignment);
+    _tmp_c.allocator()->init(TensorInfo{}, alignment);
+    if(!_pretranspose_b)
+    {
+        // If B is transposed at every iteration then transformed_B can be managed:
+        _memory_group.manage(&_transformed_b);
+    }
+    switch(input_type)
+    {
+        case DataType::F32:
+            _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("DataType not supported");
+            break;
+    }
+    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+    _block_sizes = _prepare_b->block_sizes();
+
+    _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
+    _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
+    _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
+
+    _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+    _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
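+
+    // Illustrative walk-through of the windows above (sizes assumed, not
+    // taken from this patch): for N = 500, K = 350, x_block = 192 and
+    // k_block = 128, the block walker spans X over [0, 576) in steps of 192
+    // and Y over [0, 384) in steps of 128, i.e. 3 x 3 (x, k) blocks per
+    // multi, while the batch window splits the m_round rows in steps of
+    // strategy_out_height for each batch.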
+
+    _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
+    _memory_group.manage(&_transformed_a);
+    _memory_group.manage(&_tmp_c);
+
+    switch(input_type)
+    {
+        case DataType::F32:
+            _transform_a     = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
+            _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _transform_a     = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            else
+            {
+                _transform_a     = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _transform_a     = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            else
+            {
+                _transform_a     = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _transform_a     = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
+            _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            break;
+    }
+    ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
+    ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+    _transformed_a.allocator()->allocate();
+    _tmp_c.allocator()->allocate();
+    _transformed_b.allocator()->allocate();
+}
+} // namespace arm_compute
--
cgit v1.2.1