From 2b3129ebb9e4366e91de5031d1e1d3759cc42c8e Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Wed, 25 Apr 2018 18:10:13 +0100 Subject: COMPMID-1041 NEON Winograd: update function to use GEMM function Change-Id: I1ecdf10e02193de7f47a72b75cce0d58a1fa1a1c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/128411 Tested-by: Jenkins Reviewed-by: Pablo Tello --- .../runtime/NEON/functions/NEWinogradLayer.h | 7 +- src/runtime/NEON/functions/NEWinogradLayer.cpp | 88 ++++++++++++++-------- 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h index 27b1e84201..8010810253 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h @@ -27,6 +27,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPPermute.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -93,8 +94,9 @@ public: NEWinogradLayer &operator=(const NEWinogradLayer &) = delete; private: - MemoryGroup _memory_group; - std::unique_ptr _batched_gemm_kernel; + MemoryGroup _memory_group; + std::unique_ptr> _arm_gemm; + std::unique_ptr _gemm_kernel; std::unique_ptr _transform_input_kernel; std::unique_ptr _transform_output_kernel; std::unique_ptr _transform_weights_kernel; @@ -109,6 +111,7 @@ private: Tensor _input_nhwc; Tensor _output_nhwc; Tensor _weights_hwio; + Tensor _workspace; const ITensor *_input; const ITensor *_weights; ITensor *_output; diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp index 264b97f7c1..7d93bcff07 100644 --- a/src/runtime/NEON/functions/NEWinogradLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/AssemblyHelper.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "support/ToolchainSupport.h" @@ -79,9 +80,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } //namespace NEWinogradLayer::NEWinogradLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), + : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), - _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false) + _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false) { } /* arm_compute */ @@ -95,27 +96,40 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co _input = input; _output = output; - std::unique_ptr> batched_gemm_kernel; std::unique_ptr> transform_input_kernel; std::unique_ptr> transform_weights_kernel; std::unique_ptr> transform_output_kernel; - switch(weights->info()->dimension(0)) + const int weights_width = weights->info()->dimension(0); + const int weights_height = weights->info()->dimension(1); + + int output_tile_rows = 0; + int output_tile_cols = 0; + int n_gemms = 0; + int N_BLOCK = 0; // Size of block used by GEMM. + + switch(weights_width) { case 3: { - batched_gemm_kernel = support::cpp14::make_unique>(); transform_input_kernel = support::cpp14::make_unique>(); transform_weights_kernel = support::cpp14::make_unique>(); transform_output_kernel = support::cpp14::make_unique>(); + output_tile_rows = 2; + output_tile_cols = 2; + n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; + N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; break; } case 5: { - batched_gemm_kernel = support::cpp14::make_unique>(); transform_input_kernel = support::cpp14::make_unique>(); transform_weights_kernel = support::cpp14::make_unique>(); transform_output_kernel = support::cpp14::make_unique>(); + output_tile_rows = 2; + output_tile_cols = 2; + n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; + N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; break; } default: @@ -170,8 +184,6 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); _input_nhwc.allocator()->allocate(); - const int weights_width = weights->info()->dimension(0); - const int weights_height = weights->info()->dimension(1); const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels }); // Configure the InputTransform @@ -192,27 +204,41 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co output_matrix_stride, reinterpret_cast(_output_nhwc.buffer()), in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); - // Configure Batched GEMMs - const int output_tile_rows = batched_gemm_kernel->get_output_tile_rows(); - const int output_tile_cols = batched_gemm_kernel->get_output_tile_cols(); - const int n_block = batched_gemm_kernel->get_number_blocks(); - const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); - const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); - const int m = in_shape.n_batches * tile_rows * tile_cols; - const int k = in_shape.n_channels; - const int n = out_channels; - const int input_matrix_row_stride = in_shape.n_channels; - const int kernel_matrix_row_stride = roundup(out_channels, n_block); - const int output_matrix_row_stride = kernel_matrix_row_stride; - const unsigned n_gemms = batched_gemm_kernel->get_number_gemms(); - - batched_gemm_kernel->configure(n_gemms, m, k, n, - input_matrix_stride, input_matrix_row_stride, - kernel_matrix_stride, kernel_matrix_row_stride, - output_matrix_stride, output_matrix_row_stride, - reinterpret_cast(_input_workspace.buffer()), - reinterpret_cast(_kernel_storage.buffer()), - reinterpret_cast(_output_workspace.buffer())); + // Configure GEMM + const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); + const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); + const int m = in_shape.n_batches * tile_rows * tile_cols; + const int k = in_shape.n_channels; + const int n = out_channels; + const int input_matrix_row_stride = in_shape.n_channels; + const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK); + const int output_matrix_row_stride = kernel_matrix_row_stride; + unsigned int num_threads = NEScheduler::get().num_threads(); + + _arm_gemm = arm_gemm::gemm(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false); + _arm_gemm->set_arrays(reinterpret_cast(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast(_kernel_storage.buffer()), + kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride); + + auto acl_gemm_wrapper = support::cpp14::make_unique>>(); + acl_gemm_wrapper->configure(_arm_gemm.get()); + const size_t workspace_size = _arm_gemm->get_working_size(); + + // Allocate workspace + if(workspace_size > 0) + { + const unsigned int alignment = 4096; + allocate_workspace(workspace_size, _workspace, _memory_group, alignment, 1); + _arm_gemm->set_working_space(reinterpret_cast(_workspace.buffer())); + } + + const unsigned int window_size = _arm_gemm->get_window_size(); + if(window_size < num_threads) + { + num_threads = window_size; + _arm_gemm->set_nthreads(num_threads); + } + + _gemm_kernel = std::move(acl_gemm_wrapper); // Reorder the convoluted output to ACL's ordering NCHW _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U)); @@ -220,7 +246,6 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co _transform_input_kernel = std::move(transform_input_kernel); _transform_weights_kernel = std::move(transform_weights_kernel); _transform_output_kernel = std::move(transform_output_kernel); - _batched_gemm_kernel = std::move(batched_gemm_kernel); //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); @@ -246,7 +271,7 @@ void NEWinogradLayer::run() NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX); //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs - NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX); + NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX); // Transform output tensor to the spatial domain NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX); @@ -258,7 +283,6 @@ void NEWinogradLayer::run() { _activationlayer_function.run(); } - _memory_group.release(); } -- cgit v1.2.1