diff options
author | Pablo Tello <pablo.tello@arm.com> | 2019-10-21 14:25:41 +0100 |
---|---|---|
committer | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2019-11-08 12:07:21 +0000 |
commit | 5264b7d5555ec980f9c52c719122479d0d676af8 (patch) | |
tree | 78260be4ee31d89d00705acbf1e0ed2361144bd4 /arm_compute/core/NEON/kernels/convolution/winograd | |
parent | 68adf4449b1f92dd2362d88bb0fd565c2c06d22c (diff) | |
download | ComputeLibrary-5264b7d5555ec980f9c52c719122479d0d676af8.tar.gz |
COMPMID-2576: Fuse activation in Winograd output transform.
Change-Id: I26dd1307847adeaaefae0a7374b9858c07d71372
Signed-off-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2172
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels/convolution/winograd')
-rw-r--r-- | arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp | 105 | ||||
-rw-r--r-- | arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp | 36 |
2 files changed, 74 insertions, 67 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp index 183c9c1061..bc0d9d4296 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,10 @@ #pragma once -#include "convolution.hpp" -#include "tensor.hpp" -#include "utils.hpp" +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" + +#include <cstddef> +#include <utility> namespace winograd { @@ -308,7 +309,8 @@ class OutputTransform : public IOutputTransform int n_batches, /**< Number of batches in output tensor. */ int n_rows, /**< Number of rows in output tensor. */ int n_cols, /**< Number of columns in output tensor. */ - int n_channels /**< Number of channels in output tensor. */ + int n_channels, /**< Number of channels in output tensor. */ + const arm_gemm::Activation &activation ); OutputTransform(OutputTransform&) = delete; @@ -344,6 +346,7 @@ class OutputTransform : public IOutputTransform static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1; const int _n_batches, _n_rows, _n_cols, _n_channels; + const TOut _output_min, _output_max; private: void transform_uncropped_tile( @@ -372,7 +375,9 @@ class OutputTransform : public IOutputTransform const TOut* biases, TOut* output, int output_row_stride, - int output_col_stride + int output_col_stride, + TOut output_min, + TOut output_max ); /** Get the working space for a thread. */ @@ -405,7 +410,8 @@ class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> : int n_batches, /**< Number of batches in output tensor. */ int n_rows, /**< Number of rows in output tensor. */ int n_cols, /**< Number of columns in output tensor. */ - int n_channels /**< Number of channels in output tensor. */ + int n_channels, /**< Number of channels in output tensor. */ + const arm_gemm::Activation &activation ); /** Set pointers to the output tensor written by the transform. */ @@ -528,79 +534,84 @@ class WinogradGEMM typedef TIn InputType; /** Get the output shape of a convolution. */ - static Tensor4DShape get_output_shape( - const KernelShape &kernel_shape, - const Tensor4DShape &in_shape, - const PaddingType padding - ); - - /* Get the memory required to transform the kernel. - */ - static size_t get_kernel_transform_working_size(const KernelShape &shape); + static std::pair<unsigned int, unsigned int> get_output_shape( + const std::pair<unsigned int, unsigned int> input_shape, + bool padding_same); /** Get the memory required to store the kernel transformed into the * Winograd domain. */ - static size_t get_kernel_storage_size(const KernelShape &shape); + static size_t get_kernel_storage_size(unsigned int n_input_channels, + unsigned int n_output_channels); /** Get the memory required to store the input tensor transformed into * the Winograd domain. */ static size_t get_input_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_channels, // Number of input channels + bool padding_same); /** Get the memory required to store the output tensor in the Winograd * domain. */ static size_t get_output_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of output rows + unsigned int n_cols, // Number of output columns + unsigned int n_channels // Number of output channels + ); /** Get the memory required to apply a Winograd operator to some input. */ static size_t get_working_space_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_input_channels, // Number of input channels + unsigned int n_output_channels, // Number of output channels + bool padding_same); /* Get the memory required by a single "input" matrix. */ static size_t get_input_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_channels, // Number of input channels + bool padding_same); static int get_input_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_channels, // Number of input channels + bool padding_same); /* Get the memory required by a single "output" matrix. */ static size_t get_output_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of output rows + unsigned int n_cols, // Number of output columns + unsigned int n_channels // Number of output channels + ); static int get_output_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of output rows + unsigned int n_cols, // Number of output columns + unsigned int n_channels // Number of output channels + ); /* Get the memory required by a single "kernel" matrix. */ - static size_t get_kernel_matrix_size(const KernelShape &shape); - static int get_kernel_matrix_stride(const KernelShape &shape); + static size_t get_kernel_matrix_size(unsigned int n_input_channels, + unsigned int n_output_channels); + static int get_kernel_matrix_stride(unsigned int n_input_channels, + unsigned int n_output_channels); static constexpr int M_BLOCK = 4; /** Size of block used by GEMM. */ static constexpr int N_BLOCK = 16; /** Size of block used by GEMM. */ diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp index 9d418bebb4..ed8fede385 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,6 @@ */ #pragma once - -#include <utility> - #include "arm_gemm_local.hpp" #include "arm_gemm.hpp" #include "winograd.hpp" @@ -42,8 +39,8 @@ class IWinogradConvolutionLayer virtual unsigned int weight_transform_get_window(void) const = 0; virtual void weight_transform_run(unsigned int start, unsigned int stop) = 0; - virtual ITransform& input_transform(void) = 0; // Expose the input transform - virtual ITransform& output_transform(void) = 0; // Expose the output transform + virtual IInputTransform& input_transform(void) = 0; // Expose the input transform + virtual IOutputTransform& output_transform(void) = 0; // Expose the output transform virtual arm_gemm::IGemmCommon *gemm(void) = 0; // Expose the underlying GEMM }; @@ -65,15 +62,18 @@ template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols WinogradRoots Roots> class WinogradConvolutionLayer : public IWinogradConvolutionLayer { + public: + using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, Roots>; + using WeightsTransform = typename WinogradBase::template WeightsTransform<TIn, TInGEMM>; + using InputTransform = typename WinogradBase::template InputTransform<TIn, TInGEMM>; + using WinogradConv = typename WinogradBase::template Convolution<TOut, TIn, TInGEMM, TOutGEMM>; + using OutputTransform = typename WinogradBase::template OutputTransform<TOutGEMM, TOut>; + private: static constexpr int InnerTileRows = OutputTileRows + KernelRows - 1; static constexpr int InnerTileCols = OutputTileCols + KernelCols - 1; static constexpr int N_GEMMS = InnerTileRows * InnerTileCols; - const KernelShape _kernel_shape; - const Tensor4DShape _input_shape; - const PaddingType _padding; - const Tensor4DShape _output_shape; const int _n_output_rows, _n_output_cols; const int _kernel_matrix_stride, _kernel_matrix_row_stride; const int _input_matrix_stride, _input_matrix_row_stride; @@ -81,19 +81,14 @@ class WinogradConvolutionLayer : public IWinogradConvolutionLayer const int _tile_rows, _tile_cols; const int _m, _k, _n; - public: - using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, Roots>; - using WeightsTransform = typename WinogradBase::template WeightsTransform<TIn, TInGEMM>; - using InputTransform = typename WinogradBase::template InputTransform<TIn, TInGEMM>; - using WinogradConv = typename WinogradBase::template Convolution<TOut, TIn, TInGEMM, TOutGEMM>; - using OutputTransform = typename WinogradBase::template OutputTransform<TOutGEMM, TOut>; - - /* Public member variables. */ WeightsTransform weights_transform; /** Operator to transform weights to Winograd domain. */ InputTransform _input_transform; /** Operator to transform input to Winograd domain. */ + const arm_gemm::GemmArgs gemm_args; arm_gemm::UniqueGemmCommon<TInGEMM, TOutGEMM> gemms; /** Operator to perform multiple GEMMs. */ OutputTransform _output_transform; /** Operator to transform output from Winograd domain. */ + public: + /** Determine how much memory (in units of TIn) to allocate for the * transformed weights. */ @@ -186,6 +181,7 @@ class WinogradConvolutionLayer : public IWinogradConvolutionLayer const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ const int n_output_channels, /** Number of feature maps in the output tensor. */ const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ + const arm_gemm::Activation &activation, const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ TInGEMM* const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ @@ -201,8 +197,8 @@ class WinogradConvolutionLayer : public IWinogradConvolutionLayer unsigned int weight_transform_get_window(void) const; void weight_transform_run(const unsigned int start, const unsigned int stop); - ITransform& input_transform(void); - ITransform& output_transform(void); + IInputTransform& input_transform(void); + IOutputTransform& output_transform(void); /* Get a pointer to the GEMM underlying the Winograd transform. */ arm_gemm::IGemmCommon *gemm(void); |