aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michalis Spyrou <michalis.spyrou@arm.com> 2018-04-25 18:10:13 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:50:15 +0000
commit 2b3129ebb9e4366e91de5031d1e1d3759cc42c8e (patch)
tree 86b0d4f7870f8f548a68fae43cc32913d0d6dd9e
parent 99d40951df87790fb884ce1c42d5e2a7a0009ee0 (diff)
download ComputeLibrary-2b3129ebb9e4366e91de5031d1e1d3759cc42c8e.tar.gz
COMPMID-1041 NEON Winograd: update function to use GEMM function
Change-Id: I1ecdf10e02193de7f47a72b75cce0d58a1fa1a1c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/128411
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
-rw-r--r-- arm_compute/runtime/NEON/functions/NEWinogradLayer.h 7
-rw-r--r-- src/runtime/NEON/functions/NEWinogradLayer.cpp 88
2 files changed, 61 insertions, 34 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
index 27b1e84201..8010810253 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
@@ -27,6 +27,7 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -93,8 +94,9 @@ public:
NEWinogradLayer &operator=(const NEWinogradLayer &) = delete;
private:
- MemoryGroup _memory_group;
- std::unique_ptr<INEKernel> _batched_gemm_kernel;
+ MemoryGroup _memory_group;
+ std::unique_ptr<arm_gemm::GemmCommon<float, float>> _arm_gemm;
+ std::unique_ptr<INEKernel> _gemm_kernel;
std::unique_ptr<INEKernel> _transform_input_kernel;
std::unique_ptr<INEKernel> _transform_output_kernel;
std::unique_ptr<INEKernel> _transform_weights_kernel;
@@ -109,6 +111,7 @@ private:
Tensor _input_nhwc;
Tensor _output_nhwc;
Tensor _weights_hwio;
+ Tensor _workspace;
const ITensor *_input;
const ITensor *_weights;
ITensor *_output;
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index 264b97f7c1..7d93bcff07 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
@@ -79,9 +80,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
} //namespace
NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
+ : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
_activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
- _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+ _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
{
} /* arm_compute */
@@ -95,27 +96,40 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
_input = input;
_output = output;
- std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
- switch(weights->info()->dimension(0))
+ const int weights_width = weights->info()->dimension(0);
+ const int weights_height = weights->info()->dimension(1);
+
+ int output_tile_rows = 0;
+ int output_tile_cols = 0;
+ int n_gemms = 0;
+ int N_BLOCK = 0; // Size of block used by GEMM.
+
+ switch(weights_width)
{
case 3:
{
- batched_gemm_kernel = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+ output_tile_rows = 2;
+ output_tile_cols = 2;
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
break;
}
case 5:
{
- batched_gemm_kernel = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+ output_tile_rows = 2;
+ output_tile_cols = 2;
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
break;
}
default:
@@ -170,8 +184,6 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
_permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
_input_nhwc.allocator()->allocate();
- const int weights_width = weights->info()->dimension(0);
- const int weights_height = weights->info()->dimension(1);
const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
// Configure the InputTransform
@@ -192,27 +204,41 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
- // Configure Batched GEMMs
- const int output_tile_rows = batched_gemm_kernel->get_output_tile_rows();
- const int output_tile_cols = batched_gemm_kernel->get_output_tile_cols();
- const int n_block = batched_gemm_kernel->get_number_blocks();
- const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
- const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int input_matrix_row_stride = in_shape.n_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, n_block);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
- const unsigned n_gemms = batched_gemm_kernel->get_number_gemms();
-
- batched_gemm_kernel->configure(n_gemms, m, k, n,
- input_matrix_stride, input_matrix_row_stride,
- kernel_matrix_stride, kernel_matrix_row_stride,
- output_matrix_stride, output_matrix_row_stride,
- reinterpret_cast<float *>(_input_workspace.buffer()),
- reinterpret_cast<float *>(_kernel_storage.buffer()),
- reinterpret_cast<float *>(_output_workspace.buffer()));
+ // Configure GEMM
+ const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
+ const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
+ const int m = in_shape.n_batches * tile_rows * tile_cols;
+ const int k = in_shape.n_channels;
+ const int n = out_channels;
+ const int input_matrix_row_stride = in_shape.n_channels;
+ const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+ const int output_matrix_row_stride = kernel_matrix_row_stride;
+ unsigned int num_threads = NEScheduler::get().num_threads();
+
+ _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
+ _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
+ kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
+
+ auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
+ acl_gemm_wrapper->configure(_arm_gemm.get());
+ const size_t workspace_size = _arm_gemm->get_working_size();
+
+ // Allocate workspace
+ if(workspace_size > 0)
+ {
+ const unsigned int alignment = 4096;
+ allocate_workspace(workspace_size, _workspace, _memory_group, alignment, 1);
+ _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+ }
+
+ const unsigned int window_size = _arm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ _arm_gemm->set_nthreads(num_threads);
+ }
+
+ _gemm_kernel = std::move(acl_gemm_wrapper);
// Reorder the convoluted output to ACL's ordering NCHW
_permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
@@ -220,7 +246,6 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co
_transform_input_kernel = std::move(transform_input_kernel);
_transform_weights_kernel = std::move(transform_weights_kernel);
_transform_output_kernel = std::move(transform_output_kernel);
- _batched_gemm_kernel = std::move(batched_gemm_kernel);
//Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
@@ -246,7 +271,7 @@ void NEWinogradLayer::run()
NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
+ NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
// Transform output tensor to the spatial domain
NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
@@ -258,7 +283,6 @@ void NEWinogradLayer::run()
{
_activationlayer_function.run();
}
-
_memory_group.release();
}