From f6c572ce404c8ac99b0b00c65b757fbadab33dc1 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 14 Feb 2018 12:47:30 +0000 Subject: COMPMID-784: Productise Winograd. a) Added support for kernel size 5. b) Templatised data type for transforms and batched gemms kernels. Change-Id: Idb83dda7a5eec19e015888ab31902bd791913297 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120540 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- .../core/NEON/kernels/NEWinogradLayerKernel.h | 349 ++++++++++++++++----- .../convolution/winograd/transforms/input.hpp | 20 +- 2 files changed, 284 insertions(+), 85 deletions(-) (limited to 'arm_compute/core') diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h index a8645dc07e..9169b75d19 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h @@ -34,8 +34,8 @@ namespace arm_compute { class ITensor; -template -class NEWinogradLayerTransformInputKernel : public INEKernel +template +class INEWinogradLayerTransformInputKernel : public INEKernel { public: /** Determine how much memory (in units of TIn) to allocate for the @@ -47,14 +47,68 @@ public: * @param[in] n_cols Number of columns in each feature map. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ - static unsigned int get_input_storage_size( + virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] input Input tensor data + * @param[in] n_batches Number of batches in input tensor. + * @param[in] n_rows Number of rows in input tensor. + * @param[in] n_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input tensor. + * @param[in] padding Padding type. + * @param[out] output Base of output matrices. + * @param[in] matrix_stride Stride between output matrices. + */ + virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0; + + virtual ~INEWinogradLayerTransformInputKernel() + { + } +}; + +template +class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel +{ +public: + /** Determine how much memory (in units of TIn) to allocate for the + * transformed input. + * + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_channels Number of feature maps in the input tensor. + * @param[in] n_rows Number of rows in each feature map. + * @param[in] n_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + */ + unsigned int get_input_storage_size( int n_batches, int n_channels, int n_rows, int n_cols, - bool same_padding); + bool same_padding) const override; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; NEWinogradLayerTransformInputKernel(); + const char *name() const override { return "NEWinogradLayerTransformInputKernel"; @@ -72,28 +126,29 @@ public: * @param[in] matrix_stride Stride between output matrices. */ void configure( - const float *const input, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels, - const PaddingType padding, - float *const output, - const int matrix_stride); + const T *const input, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels, + const PaddingType padding, + T *const output, + const int matrix_stride) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; + using WinogradBase = winograd::WinogradGEMM; + using WinogradConv = typename WinogradBase::template Convolution; + private: - using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using InputTransform = typename WinogradBase::template InputTransform; + using InputTransform = typename WinogradBase::template InputTransform; std::unique_ptr _transform; }; -template -class NEWinogradLayerTransformOutputKernel : public INEKernel +template +class INEWinogradLayerTransformOutputKernel : public INEKernel { public: /** Determine how much memory (in units of TOut) to allocate for the @@ -105,13 +160,58 @@ public: * @param[in] n_output_channels Number of feature maps in the output tensor. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ - static unsigned int get_output_storage_size( - int n_batches, - int n_rows, - int n_cols, - int n_output_channels, - bool same_padding); + virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0; + + /** Gets the stride between matrices in the output worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + + /** Get the output shape of a convolution. + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] in_shape The shape of the input tensor. + * @param[in] padding The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] biases Pointer to the biases tensor. + * @param[in] output_workingspace Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() + * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_rows Number of rows in output tensor. + * @param[in] n_cols Number of columns in output tensor. + * @param[in] n_channels Number of feature maps in the output tensor. + */ + virtual void configure( + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) = 0; + + virtual ~INEWinogradLayerTransformOutputKernel() + { + } +}; +template +class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel +{ +public: const char *name() const override { return "NEWinogradLayerTransformOutputKernel"; @@ -130,6 +230,37 @@ public: ~NEWinogradLayerTransformOutputKernel() = default; + // Inherited methods overridden: + /** Determine how much memory (in units of TOut) to allocate for the + * (Winograd domain) output. + * + * @param[in] n_batches Number of batches in the output tensor. + * @param[in] n_rows Number of rows in each feature map of the input tensor. + * @param[in] n_cols Number of columns in each feature map of the input tensor. + * @param[in] n_output_channels Number of feature maps in the output tensor. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + */ + unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override; + + /** Gets the stride between matrices in the output worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; + /** Get the output shape of a convolution. + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] in_shape The shape of the input tensor. + * @param[in] padding The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const override; + /** Configure the output transform kernel. * * @param[in] biases Pointer to the biases tensor. @@ -142,53 +273,54 @@ public: * @param[in] n_channels Number of feature maps in the output tensor. */ void configure( - const ITensor *biases, - const float *const output_workingspace, - const int matrix_stride, - float *const output, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels); + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) override; - // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; private: using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using OutputTransform = typename WinogradBase::template OutputTransform; + using WinogradConv = typename WinogradBase::template Convolution; + using OutputTransform = typename WinogradBase::template OutputTransform; const ITensor *_biases; - const float *_output_workspace; + const T *_output_workspace; int _matrix_stride; int _matrix_row_stride; - float *_output; + T *_output; int _n_batches; int _n_rows; int _n_cols; int _n_channels; }; -template -class NEWinogradLayerTransformWeightsKernel final : public INEKernel +template +class INEWinogradLayerTransformWeightsKernel : public INEKernel { public: - /** Determine how much memory (in units of TIn) to allocate for the + /** Determine how much memory (in units of T) to allocate for the * transformed weights. * * @param[in] n_output_channels Number of output feature maps. * @param[in] n_input_channels Number of input feature maps. */ - static unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels); + virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0; + /** Gets the stride between matrices in the kernel worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape) const = 0; - NEWinogradLayerTransformWeightsKernel(); - const char *name() const override - { - return "NEWinogradLayerTransformWeightsKernel"; - } - /** Configure the output transform kernel. + /** Configure the weights transform kernel. * * @param[in] weights_hwio Pointer to the weights tensor * @param[in] output Pointer to working space for the output tensor in the Winograd domain. @@ -196,53 +328,119 @@ public: * @param[in] n_output_channels Number of filters. * @param[in] n_input_channels Number of channels in each filter. */ - void configure( - const ITensor *weights_hwio, - float *const output, - const int matrix_stride, - const int n_output_channels, - const int n_input_channels); + virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) = 0; - // Inherited methods overridden: + virtual ~INEWinogradLayerTransformWeightsKernel() + { + } +}; + +template +class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel +{ +public: + NEWinogradLayerTransformWeightsKernel(); + const char *name() const override + { + return "NEWinogradLayerTransformWeightsKernel"; + } + // Inherited methods overridden: + void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) override; + unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const override; + int get_matrix_stride(const KernelShape &kernel_shape) const override; void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; private: using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using WeightsTransform = typename WinogradBase::template WeightsTransform; + using WinogradConv = typename WinogradBase::template Convolution; + using WeightsTransform = typename WinogradBase::template WeightsTransform; std::unique_ptr _transform; }; -template -class NEWinogradLayerKernel : public INEKernel +template +class INEWinogradLayerBatchedGEMMKernel : public INEKernel +{ +public: + /** Get the number of GEMMs to compute + */ + virtual unsigned int get_number_gemms() const = 0; + /** Initialise the kernel + * + * @param[in] n_gemms Number of GEMMs to compute. + * @param[in] M in_shape.n_batches * tile_rows * tile_cols. + * @param[in] K Number of channels in the input tensor. + * @param[in] N Number of channels in the output tensor. + * @param[in] a_matrix_stride Stride between input matrices. + * @param[in] a_row_stride Row stride inside input matrix. + * @param[in] b_matrix_stride Stride between weights matrices. + * @param[in] b_row_stride Row stride inside the weights matrix. + * @param[in] c_matrix_stride Stride between output matrices. + * @param[in] c_row_stride Row stride inside the output matrix. + * @param[out] a_ptr Input workspace. + * @param[out] b_ptr Kernel workspace. + * @param[out] c_ptr Output workspace. + */ + virtual void configure( + const unsigned int n_gemms, + const int M, const int K, const int N, + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) = 0; + + /** Get the number of tiles per row + */ + virtual int get_output_tile_rows() const = 0; + /** Get the number of tiles per columns + */ + virtual int get_output_tile_cols() const = 0; + /** Get the number of blocks + */ + virtual int get_number_blocks() const = 0; +}; + +template +class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel { public: using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using MultiGEMM = winograd::BatchedBlockedGemm; + using WinogradConv = typename WinogradBase::template Convolution; + using MultiGEMM = winograd::BatchedBlockedGemm; static const int _output_tile_rows = OutputTileRows; static const int _output_tile_cols = OutputTileCols; const char *name() const override { - return "NEWinogradLayerKernel"; + return "NEWinogradLayerBatchedGEMMKernel"; } /** Constructor */ - NEWinogradLayerKernel(); + NEWinogradLayerBatchedGEMMKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerKernel(const NEWinogradLayerKernel &) = delete; + NEWinogradLayerBatchedGEMMKernel(const NEWinogradLayerBatchedGEMMKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerKernel &operator=(const NEWinogradLayerKernel &) = delete; + NEWinogradLayerBatchedGEMMKernel &operator=(const NEWinogradLayerBatchedGEMMKernel &) = delete; /** Allow instances of this class to be moved */ - NEWinogradLayerKernel(NEWinogradLayerKernel &&) = default; + NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel &&) = default; /** Allow instances of this class to be moved */ - NEWinogradLayerKernel &operator=(NEWinogradLayerKernel &&) = default; + NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default; - ~NEWinogradLayerKernel() = default; + ~NEWinogradLayerBatchedGEMMKernel() = default; + + // Inherited methods overridden: + + unsigned int get_number_gemms() const override; + int get_output_tile_rows() const override; + int get_output_tile_cols() const override; + int get_number_blocks() const override; /** Initialise the kernel * @@ -263,17 +461,16 @@ public: void configure( const unsigned int n_gemms, const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const float *const a_ptr, - const float *const b_ptr, - float *const c_ptr); + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) override; - // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp index 6dd8f5460a..fc4b255a9c 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp @@ -23,7 +23,7 @@ */ #pragma once -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "../winograd_gemm.hpp" namespace winograd { @@ -45,9 +45,8 @@ namespace winograd ) { // Compute the padding required on each edge of the image - const bool base_padding = (padding_type == PADDING_SAME) ? 1 : 0; - const int pad_top = base_padding; - const int pad_left = base_padding; + const int pad_top = (padding_type == PADDING_SAME) ? (kernel_rows - 1) / 2 : 0; + const int pad_left = (padding_type == PADDING_SAME) ? (kernel_cols - 1) / 2 : 0; const int tile_overlap = kernel_rows - 1; // Compute striding values (assuming NHWC ordered data) @@ -68,8 +67,7 @@ namespace winograd for (int tile_i = 0; tile_i < tile_M; tile_i++) { // Pointer to the row - const int row_offset = (tile_i == 0) ? - 0 : ((padding_type == PADDING_VALID) ? 0 : 1); + const int row_offset = (tile_i == 0) ? 0 : pad_top; const T* const input_base_row = ( input_base_batch + ((inner_tile_rows - (kernel_rows - 1))*tile_i - row_offset)*input_row_stride ); @@ -129,7 +127,9 @@ namespace winograd T* const outptr = matrix_base + tile_j*matrix_row_stride; // Apply the specific tile processing function - tile_fns[pad_top][t_pad_left][pad_bottom][t_pad_right]( + const int f_pad_top = pad_top ? 1 : 0; + const int f_pad_left = t_pad_left ? 1 : 0; + tile_fns[f_pad_top][f_pad_left][pad_bottom][t_pad_right]( n_channels, input_base_col, input_row_stride, @@ -156,8 +156,10 @@ namespace winograd ) : _inptr(input), _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels), _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), - _tiles_M(iceildiv((padding == PADDING_SAME) ? n_rows : n_rows - 2, output_tile_rows)), - _tiles_N(iceildiv((padding == PADDING_SAME) ? n_cols : n_cols - 2, output_tile_cols)), + _tiles_M(iceildiv((padding == PADDING_SAME) ? n_rows : n_rows - kr + 1, + output_tile_rows)), + _tiles_N(iceildiv((padding == PADDING_SAME) ? n_cols : n_cols - kc + 1, + output_tile_cols)), _padding_type(padding) { } -- cgit v1.2.1