From f6c572ce404c8ac99b0b00c65b757fbadab33dc1 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 14 Feb 2018 12:47:30 +0000 Subject: COMPMID-784: Productise Winograd. a) Added support for kernel size 5. b) Templatised data type for transforms and batched gemms kernels. Change-Id: Idb83dda7a5eec19e015888ab31902bd791913297 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120540 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- .../core/NEON/kernels/NEWinogradLayerKernel.h | 349 ++++++++++++++++----- .../convolution/winograd/transforms/input.hpp | 20 +- .../runtime/NEON/functions/NEWinogradLayer.h | 18 +- docs/00_introduction.dox | 11 +- src/core/NEON/kernels/NEWinogradLayerKernel.cpp | 204 +++++++----- src/runtime/NEON/functions/NEWinogradLayer.cpp | 110 ++++--- tests/datasets/SmallConvolutionLayerDataset.h | 4 + 7 files changed, 512 insertions(+), 204 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h index a8645dc07e..9169b75d19 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h @@ -34,8 +34,8 @@ namespace arm_compute { class ITensor; -template -class NEWinogradLayerTransformInputKernel : public INEKernel +template +class INEWinogradLayerTransformInputKernel : public INEKernel { public: /** Determine how much memory (in units of TIn) to allocate for the @@ -47,14 +47,68 @@ public: * @param[in] n_cols Number of columns in each feature map. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ - static unsigned int get_input_storage_size( + virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0; + + /** Gets the stride between matrices in the input workspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. 
+ * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + + /** Configure the input transform kernel. + * + * @param[in] input Input tensor data + * @param[in] n_batches Number of batches in input tensor. + * @param[in] n_rows Number of rows in input tensor. + * @param[in] n_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input tensor. + * @param[in] padding Padding type. + * @param[out] output Base of output matrices. + * @param[in] matrix_stride Stride between output matrices. + */ + virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0; + + virtual ~INEWinogradLayerTransformInputKernel() + { + } +}; + +template +class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel +{ +public: + /** Determine how much memory (in units of T) to allocate for the + * transformed input. + * + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_channels Number of feature maps in the input tensor. + * @param[in] n_rows Number of rows in each feature map. + * @param[in] n_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + */ + unsigned int get_input_storage_size( int n_batches, int n_channels, int n_rows, int n_cols, - bool same_padding); + bool same_padding) const override; + + /** Gets the stride between matrices in the input workspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. 
+ */ + int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; NEWinogradLayerTransformInputKernel(); + const char *name() const override { return "NEWinogradLayerTransformInputKernel"; @@ -72,28 +126,29 @@ public: * @param[in] matrix_stride Stride between output matrices. */ void configure( - const float *const input, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels, - const PaddingType padding, - float *const output, - const int matrix_stride); + const T *const input, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels, + const PaddingType padding, + T *const output, + const int matrix_stride) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; + using WinogradBase = winograd::WinogradGEMM; + using WinogradConv = typename WinogradBase::template Convolution; + private: - using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using InputTransform = typename WinogradBase::template InputTransform; + using InputTransform = typename WinogradBase::template InputTransform; std::unique_ptr _transform; }; -template -class NEWinogradLayerTransformOutputKernel : public INEKernel +template +class INEWinogradLayerTransformOutputKernel : public INEKernel { public: /** Determine how much memory (in units of TOut) to allocate for the @@ -105,13 +160,58 @@ public: * @param[in] n_output_channels Number of feature maps in the output tensor. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 
*/ - static unsigned int get_output_storage_size( - int n_batches, - int n_rows, - int n_cols, - int n_output_channels, - bool same_padding); + virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0; + + /** Gets the stride between matrices in the output workspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + + /** Get the output shape of a convolution. + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] in_shape The shape of the input tensor. + * @param[in] padding The type of padding to be used. + * + * @return Shape of the output tensor. + */ + virtual Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] biases Pointer to the biases tensor. + * @param[in] output_workingspace Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() + * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_rows Number of rows in output tensor. + * @param[in] n_cols Number of columns in output tensor. + * @param[in] n_channels Number of feature maps in the output tensor. 
+ */ + virtual void configure( + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) = 0; + + virtual ~INEWinogradLayerTransformOutputKernel() + { + } +}; +template +class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel +{ +public: const char *name() const override { return "NEWinogradLayerTransformOutputKernel"; @@ -130,6 +230,37 @@ public: ~NEWinogradLayerTransformOutputKernel() = default; + // Inherited methods overridden: + /** Determine how much memory (in units of T) to allocate for the + * (Winograd domain) output. + * + * @param[in] n_batches Number of batches in the output tensor. + * @param[in] n_rows Number of rows in each feature map of the input tensor. + * @param[in] n_cols Number of columns in each feature map of the input tensor. + * @param[in] n_output_channels Number of feature maps in the output tensor. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + */ + unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override; + + /** Gets the stride between matrices in the output workspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; + /** Get the output shape of a convolution. + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] in_shape The shape of the input tensor. + * @param[in] padding The type of padding to be used. + * + * @return Shape of the output tensor. 
+ */ + Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const override; + /** Configure the output transform kernel. * * @param[in] biases Pointer to the biases tensor. @@ -142,53 +273,54 @@ public: * @param[in] n_channels Number of feature maps in the output tensor. */ void configure( - const ITensor *biases, - const float *const output_workingspace, - const int matrix_stride, - float *const output, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels); + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) override; - // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; private: using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using OutputTransform = typename WinogradBase::template OutputTransform; + using WinogradConv = typename WinogradBase::template Convolution; + using OutputTransform = typename WinogradBase::template OutputTransform; const ITensor *_biases; - const float *_output_workspace; + const T *_output_workspace; int _matrix_stride; int _matrix_row_stride; - float *_output; + T *_output; int _n_batches; int _n_rows; int _n_cols; int _n_channels; }; -template -class NEWinogradLayerTransformWeightsKernel final : public INEKernel +template +class INEWinogradLayerTransformWeightsKernel : public INEKernel { public: - /** Determine how much memory (in units of TIn) to allocate for the + /** Determine how much memory (in units of T) to allocate for the * transformed weights. * * @param[in] n_output_channels Number of output feature maps. * @param[in] n_input_channels Number of input feature maps. 
*/ - static unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels); + virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0; + /** Gets the stride between matrices in the kernel workspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape) const = 0; - NEWinogradLayerTransformWeightsKernel(); - const char *name() const override - { - return "NEWinogradLayerTransformWeightsKernel"; - } - /** Configure the output transform kernel. + /** Configure the weights transform kernel. * * @param[in] weights_hwio Pointer to the weights tensor * @param[in] output Pointer to working space for the output tensor in the Winograd domain. @@ -196,53 +328,119 @@ public: * @param[in] n_output_channels Number of filters. * @param[in] n_input_channels Number of channels in each filter. */ - void configure( - const ITensor *weights_hwio, - float *const output, - const int matrix_stride, - const int n_output_channels, - const int n_input_channels); + virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) = 0; - // Inherited methods overridden: + virtual ~INEWinogradLayerTransformWeightsKernel() + { + } +}; + +template +class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel +{ +public: + NEWinogradLayerTransformWeightsKernel(); + const char *name() const override + { + return "NEWinogradLayerTransformWeightsKernel"; + } + // Inherited methods overridden: + void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) override; + unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const override; + int get_matrix_stride(const KernelShape &kernel_shape) 
const override; void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; private: using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using WeightsTransform = typename WinogradBase::template WeightsTransform; + using WinogradConv = typename WinogradBase::template Convolution; + using WeightsTransform = typename WinogradBase::template WeightsTransform; std::unique_ptr _transform; }; -template -class NEWinogradLayerKernel : public INEKernel +template +class INEWinogradLayerBatchedGEMMKernel : public INEKernel +{ +public: + /** Get the number of GEMMs to compute + */ + virtual unsigned int get_number_gemms() const = 0; + /** Initialise the kernel + * + * @param[in] n_gemms Number of GEMMs to compute. + * @param[in] M in_shape.n_batches * tile_rows * tile_cols. + * @param[in] K Number of channels in the input tensor. + * @param[in] N Number of channels in the output tensor. + * @param[in] a_matrix_stride Stride between input matrices. + * @param[in] a_row_stride Row stride inside input matrix. + * @param[in] b_matrix_stride Stride between weights matrices. + * @param[in] b_row_stride Row stride inside the weights matrix. + * @param[in] c_matrix_stride Stride between output matrices. + * @param[in] c_row_stride Row stride inside the output matrix. + * @param[in] a_ptr Input workspace. + * @param[in] b_ptr Kernel workspace. + * @param[out] c_ptr Output workspace. 
+ */ + virtual void configure( + const unsigned int n_gemms, + const int M, const int K, const int N, + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) = 0; + + /** Get the number of rows in each output tile + */ + virtual int get_output_tile_rows() const = 0; + /** Get the number of columns in each output tile + */ + virtual int get_output_tile_cols() const = 0; + /** Get the number of blocks + */ + virtual int get_number_blocks() const = 0; +}; + +template +class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel { public: using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using MultiGEMM = winograd::BatchedBlockedGemm; + using WinogradConv = typename WinogradBase::template Convolution; + using MultiGEMM = winograd::BatchedBlockedGemm; static const int _output_tile_rows = OutputTileRows; static const int _output_tile_cols = OutputTileCols; const char *name() const override { - return "NEWinogradLayerKernel"; + return "NEWinogradLayerBatchedGEMMKernel"; } /** Constructor */ - NEWinogradLayerKernel(); + NEWinogradLayerBatchedGEMMKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerKernel(const NEWinogradLayerKernel &) = delete; + NEWinogradLayerBatchedGEMMKernel(const NEWinogradLayerBatchedGEMMKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerKernel &operator=(const NEWinogradLayerKernel &) = delete; + NEWinogradLayerBatchedGEMMKernel &operator=(const NEWinogradLayerBatchedGEMMKernel &) = delete; /** Allow instances of this class to be moved */ - NEWinogradLayerKernel(NEWinogradLayerKernel &&) = default; + NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel 
&&) = default; /** Allow instances of this class to be moved */ - NEWinogradLayerKernel &operator=(NEWinogradLayerKernel &&) = default; + NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default; - ~NEWinogradLayerKernel() = default; + ~NEWinogradLayerBatchedGEMMKernel() = default; + + // Inherited methods overridden: + + unsigned int get_number_gemms() const override; + int get_output_tile_rows() const override; + int get_output_tile_cols() const override; + int get_number_blocks() const override; /** Initialise the kernel * @@ -263,17 +461,16 @@ public: void configure( const unsigned int n_gemms, const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const float *const a_ptr, - const float *const b_ptr, - float *const c_ptr); + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) override; - // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp index 6dd8f5460a..fc4b255a9c 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp @@ -23,7 +23,7 @@ */ #pragma once -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "../winograd_gemm.hpp" namespace winograd { @@ -45,9 +45,8 @@ namespace winograd ) { // Compute the padding required on each edge of the image - const bool base_padding = (padding_type == PADDING_SAME) ? 
1 : 0; - const int pad_top = base_padding; - const int pad_left = base_padding; + const int pad_top = (padding_type == PADDING_SAME) ? (kernel_rows - 1) / 2 : 0; + const int pad_left = (padding_type == PADDING_SAME) ? (kernel_cols - 1) / 2 : 0; const int tile_overlap = kernel_rows - 1; // Compute striding values (assuming NHWC ordered data) @@ -68,8 +67,7 @@ namespace winograd for (int tile_i = 0; tile_i < tile_M; tile_i++) { // Pointer to the row - const int row_offset = (tile_i == 0) ? - 0 : ((padding_type == PADDING_VALID) ? 0 : 1); + const int row_offset = (tile_i == 0) ? 0 : pad_top; const T* const input_base_row = ( input_base_batch + ((inner_tile_rows - (kernel_rows - 1))*tile_i - row_offset)*input_row_stride ); @@ -129,7 +127,9 @@ namespace winograd T* const outptr = matrix_base + tile_j*matrix_row_stride; // Apply the specific tile processing function - tile_fns[pad_top][t_pad_left][pad_bottom][t_pad_right]( + const int f_pad_top = pad_top ? 1 : 0; + const int f_pad_left = t_pad_left ? 1 : 0; + tile_fns[f_pad_top][f_pad_left][pad_bottom][t_pad_right]( n_channels, input_base_col, input_row_stride, @@ -156,8 +156,10 @@ namespace winograd ) : _inptr(input), _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels), _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), - _tiles_M(iceildiv((padding == PADDING_SAME) ? n_rows : n_rows - 2, output_tile_rows)), - _tiles_N(iceildiv((padding == PADDING_SAME) ? n_cols : n_cols - 2, output_tile_cols)), + _tiles_M(iceildiv((padding == PADDING_SAME) ? n_rows : n_rows - kr + 1, + output_tile_rows)), + _tiles_N(iceildiv((padding == PADDING_SAME) ? 
n_cols : n_cols - kc + 1, + output_tile_cols)), _padding_type(padding) { } diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h index 63cac3a3b4..f57be697b5 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h @@ -26,7 +26,7 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h" +#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPPermute.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -38,6 +38,11 @@ namespace arm_compute { class ITensor; /** Basic function to simulate a convolution layer. This function calls the following NEON kernels: + * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) + * -# @ref NEWinogradLayerTransformInputKernel + * -# @ref NEWinogradLayerTransformOutputKernel + * -# @ref NEWinogradLayerBatchedGEMMKernel + * -# @ref CPPPermute (three times: weights, input and output) */ class NEWinogradLayer : public IFunction { @@ -68,11 +73,12 @@ public: NEWinogradLayer &operator=(const NEWinogradLayer &) = delete; private: - MemoryGroup _memory_group; - NEWinogradLayerKernel<2, 2, 3, 3> _winograd_kernel; - NEWinogradLayerTransformInputKernel<2, 2, 3, 3> _transform_input_kernel; - NEWinogradLayerTransformOutputKernel<2, 2, 3, 3> _transform_output_kernel; - NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3> _transform_weights_kernel; + MemoryGroup _memory_group; + std::unique_ptr _batched_gemm_kernel; + std::unique_ptr _transform_input_kernel; + std::unique_ptr _transform_output_kernel; + std::unique_ptr _transform_weights_kernel; + CPPPermute _permute_input; CPPPermute _permute_weights; CPPPermute _permute_output; diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 787b38dde0..c7faea7122 100644 --- 
a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -189,6 +189,13 @@ If there is more than one release in a month then an extra sequential number is @subsection S2_2_changelog Changelog +v18.02 Public maintenance release + - New NEON kernels / functions + - @ref arm_compute::NEWinogradLayerTransformInputKernel / @ref arm_compute::NEWinogradLayer + - @ref arm_compute::NEWinogradLayerTransformOutputKernel / @ref arm_compute::NEWinogradLayer + - @ref arm_compute::NEWinogradLayerTransformWeightsKernel / @ref arm_compute::NEWinogradLayer + - Renamed arm_compute::NEWinogradLayerKernel into @ref arm_compute::NEWinogradLayerBatchedGEMMKernel + v18.01 Public maintenance release - Various bug fixes - Added some of the missing validate() methods @@ -205,7 +212,7 @@ v18.01 Public maintenance release - @ref arm_compute::GCGEMMInterleave4x4Kernel - @ref arm_compute::GCGEMMTranspose1xWKernel - @ref arm_compute::GCIm2ColKernel - - Refactored NEON Winograd (@ref arm_compute::NEWinogradLayerKernel) + - Refactored NEON Winograd (arm_compute::NEWinogradLayerKernel) - Added @ref arm_compute::NEDirectConvolutionLayerOutputStageKernel - Added QASYMM8 support to the following NEON kernels: - @ref arm_compute::NEDepthwiseConvolutionLayer3x3Kernel @@ -256,7 +263,7 @@ v17.12 Public major release - @ref arm_compute::NEGEMMLowpOffsetContributionKernel / @ref arm_compute::NEGEMMLowpMatrixAReductionKernel / @ref arm_compute::NEGEMMLowpMatrixBReductionKernel / @ref arm_compute::NEGEMMLowpMatrixMultiplyCore - @ref arm_compute::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref arm_compute::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - @ref arm_compute::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel / @ref arm_compute::NEGEMMLowpQuantizeDownInt32ToUint8Scale - - @ref arm_compute::NEWinogradLayerKernel / @ref arm_compute::NEWinogradLayer + - @ref arm_compute::NEWinogradLayer / arm_compute::NEWinogradLayerKernel - New OpenCL kernels / functions - @ref 
arm_compute::CLGEMMLowpOffsetContributionKernel / @ref arm_compute::CLGEMMLowpMatrixAReductionKernel / @ref arm_compute::CLGEMMLowpMatrixBReductionKernel / @ref arm_compute::CLGEMMLowpMatrixMultiplyCore diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp index b0a36ff46a..b2e44f8e09 100644 --- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp @@ -32,25 +32,25 @@ namespace arm_compute { //Batched Gemms -template -NEWinogradLayerKernel::NEWinogradLayerKernel() +template +NEWinogradLayerBatchedGEMMKernel::NEWinogradLayerBatchedGEMMKernel() : _gemms() { } -template -void NEWinogradLayerKernel::configure( +template +void NEWinogradLayerBatchedGEMMKernel::configure( const unsigned int n_gemms, const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const float *const a_ptr, - const float *const b_ptr, - float *const c_ptr) + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) { _gemms = support::cpp14::make_unique(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr); Window win; @@ -59,8 +59,8 @@ void NEWinogradLayerKernel -void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerBatchedGEMMKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -69,36 +69,66 @@ void NEWinogradLayerKernelrun(first_gemm, last_gemm); } -template class NEWinogradLayerKernel<2, 2, 3, 3>; +template +unsigned int 
NEWinogradLayerBatchedGEMMKernel::get_number_gemms() const +{ + return WinogradBase::N_GEMMS; +} + +template +int NEWinogradLayerBatchedGEMMKernel::get_output_tile_rows() const +{ + return _output_tile_rows; +} + +template +int NEWinogradLayerBatchedGEMMKernel::get_output_tile_cols() const +{ + return _output_tile_cols; +} + +template +int NEWinogradLayerBatchedGEMMKernel::get_number_blocks() const +{ + return WinogradConv::N_BLOCK; +} + +template class NEWinogradLayerBatchedGEMMKernel; +template class NEWinogradLayerBatchedGEMMKernel; // Weights transform -template -unsigned int NEWinogradLayerTransformWeightsKernel::get_weight_storage_size(int n_output_channels, int n_input_channels) +template +unsigned int NEWinogradLayerTransformWeightsKernel::get_weight_storage_size(int n_output_channels, int n_input_channels) const { const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels); return static_cast( - // WinogradConv returns the size in bytes, we divide by `sizeof(float)` to - // express that in units of float. - WinogradConv::get_kernel_storage_size(shape) / sizeof(float)); + // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T + WinogradConv::get_kernel_storage_size(shape) / sizeof(T)); } -template -NEWinogradLayerTransformWeightsKernel::NEWinogradLayerTransformWeightsKernel() +template +NEWinogradLayerTransformWeightsKernel::NEWinogradLayerTransformWeightsKernel() : _transform() { } -template -void NEWinogradLayerTransformWeightsKernel::configure( +template +int NEWinogradLayerTransformWeightsKernel::get_matrix_stride(const KernelShape &kernel_shape) const +{ + return WinogradConv::get_kernel_matrix_stride(kernel_shape); +} + +template +void NEWinogradLayerTransformWeightsKernel::configure( const ITensor *weights_hwio, - float *const output, + T *const output, const int matrix_stride, /** Stride across matrices in the output. */ const int n_output_channels, /** Number of filters. 
*/ const int n_input_channels) /** Number of channels in each filter. */ { const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK); - _transform = support::cpp14::make_unique(reinterpret_cast(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels, + _transform = support::cpp14::make_unique(reinterpret_cast(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels, n_input_channels); Window win; auto win_last = _transform->get_window(); @@ -106,8 +136,8 @@ void NEWinogradLayerTransformWeightsKernel -void NEWinogradLayerTransformWeightsKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerTransformWeightsKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -116,50 +146,57 @@ void NEWinogradLayerTransformWeightsKernelrun(fst, lst); } -template -bool NEWinogradLayerTransformWeightsKernel::is_parallelisable() const +template +bool NEWinogradLayerTransformWeightsKernel::is_parallelisable() const { return false; } -template class NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3>; +template class NEWinogradLayerTransformWeightsKernel; +template class NEWinogradLayerTransformWeightsKernel; // Input transform -template -unsigned int NEWinogradLayerTransformInputKernel::get_input_storage_size( +template +unsigned int NEWinogradLayerTransformInputKernel::get_input_storage_size( int n_batches, /** Number of batches in the input tensor. */ int n_channels, /** Number of feature maps in the input tensor. */ int n_rows, /** Number of rows in each feature map. */ int n_cols, /** Number of columns in each feature map. */ bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ -) +) const { // Construct shapes for the input and kernel tensors. 
const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels); const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels); const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; // Return the size, converted into units of TIn - return static_cast( - WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(float)); + return static_cast(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T)); +} + +template +int NEWinogradLayerTransformInputKernel::get_matrix_stride( + const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const +{ + return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type); } -template -NEWinogradLayerTransformInputKernel::NEWinogradLayerTransformInputKernel() +template +NEWinogradLayerTransformInputKernel::NEWinogradLayerTransformInputKernel() : _transform() { } -template -void NEWinogradLayerTransformInputKernel::configure( - const float *const input, /** Input tensor data */ - const int n_batches, /** Number of batches in input tensor. */ - const int n_rows, /** Number of rows in input tensor. */ - const int n_cols, /** Number of columns in input tensor. */ - const int n_channels, /** Number of channels in input tensor. */ - const PaddingType padding, /** Padding type. */ - float *const output, /** Base of output matrices. */ - const int matrix_stride) /** Stride between output matrices. */ +template +void NEWinogradLayerTransformInputKernel::configure( + const T *const input, /** Input tensor data */ + const int n_batches, /** Number of batches in input tensor. */ + const int n_rows, /** Number of rows in input tensor. */ + const int n_cols, /** Number of columns in input tensor. */ + const int n_channels, /** Number of channels in input tensor. */ + const PaddingType padding, /** Padding type. */ + T *const output, /** Base of output matrices. 
*/ + const int matrix_stride) /** Stride between output matrices. */ { // _input_matrix_row_stride(n_input_channels), _transform = support::cpp14::make_unique(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels); @@ -169,8 +206,8 @@ void NEWinogradLayerTransformInputKernel -void NEWinogradLayerTransformInputKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerTransformInputKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -179,24 +216,25 @@ void NEWinogradLayerTransformInputKernelrun(fst, lst); } -template -bool NEWinogradLayerTransformInputKernel::is_parallelisable() const +template +bool NEWinogradLayerTransformInputKernel::is_parallelisable() const { return false; } -template class NEWinogradLayerTransformInputKernel<2, 2, 3, 3>; +template class NEWinogradLayerTransformInputKernel; +template class NEWinogradLayerTransformInputKernel; // Output transform -template -unsigned int NEWinogradLayerTransformOutputKernel::get_output_storage_size( +template +unsigned int NEWinogradLayerTransformOutputKernel::get_output_storage_size( int n_batches, /** Number of batches in the output tensor. */ int n_rows, /** Number of rows in each feature map of the input tensor. */ int n_cols, /** Number of columns in each feature map of the input tensor. */ int n_output_channels, /** Number of feature maps in the output tensor. */ bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ -) +) const { // Construct shapes for the input and kernel tensors. 
const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1); @@ -205,25 +243,38 @@ unsigned int NEWinogradLayerTransformOutputKernel( - WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(float)); + WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T)); } -template -NEWinogradLayerTransformOutputKernel::NEWinogradLayerTransformOutputKernel() +template +NEWinogradLayerTransformOutputKernel::NEWinogradLayerTransformOutputKernel() : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0) { } -template -void NEWinogradLayerTransformOutputKernel::configure( - const ITensor *biases, - const float *const output_workingspace, - const int matrix_stride, - float *const output, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels) +template +int NEWinogradLayerTransformOutputKernel::get_matrix_stride( + const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const +{ + return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type); +} +template +Tensor4DShape NEWinogradLayerTransformOutputKernel::get_output_shape( + const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const +{ + return WinogradConv::get_output_shape(kernel_shape, in_shape, padding); +} + +template +void NEWinogradLayerTransformOutputKernel::configure( + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) { _biases = biases; _output_workspace = output_workingspace; @@ -243,8 +294,8 @@ void NEWinogradLayerTransformOutputKernel -void NEWinogradLayerTransformOutputKernel::run(const Window &window, const ThreadInfo &info) +template +void 
NEWinogradLayerTransformOutputKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -253,7 +304,7 @@ void NEWinogradLayerTransformOutputKernel(_biases->buffer()), _output, + reinterpret_cast(_biases->buffer()), _output, _n_batches, _n_rows, _n_cols, _n_channels); // The code below cannot be moved to configure because biases hasn't been allocated at that point @@ -262,12 +313,13 @@ void NEWinogradLayerTransformOutputKernel -bool NEWinogradLayerTransformOutputKernel::is_parallelisable() const +template +bool NEWinogradLayerTransformOutputKernel::is_parallelisable() const { return false; } -template class NEWinogradLayerTransformOutputKernel<2, 2, 3, 3>; +template class NEWinogradLayerTransformOutputKernel; +template class NEWinogradLayerTransformOutputKernel; } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp index 215f1bfddf..e343583b36 100644 --- a/src/runtime/NEON/functions/NEWinogradLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp @@ -28,6 +28,8 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "support/ToolchainSupport.h" +#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h" + #include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" namespace @@ -45,8 +47,9 @@ inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input) namespace arm_compute { NEWinogradLayer::NEWinogradLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _winograd_kernel(), _transform_input_kernel(), _transform_output_kernel(), _transform_weights_kernel(), _permute_input(), _permute_weights(), - _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false) + : 
_memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _permute_input(), + _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), + _reshaped_kernel(false) { } /* arm_compute */ @@ -54,7 +57,7 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(1) != 3 || weights->info()->dimension(0) != 3, "Only 3x3 kernels are supported"); + ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5, "Only 3 and 5 kernels are supported"); ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); if(biases != nullptr) @@ -67,6 +70,36 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co _input = input; _output = output; + std::unique_ptr> batched_gemm_kernel; + std::unique_ptr> transform_input_kernel; + std::unique_ptr> transform_weights_kernel; + std::unique_ptr> transform_output_kernel; + + switch(weights->info()->dimension(0)) + { + case 3: + { + batched_gemm_kernel = support::cpp14::make_unique>(); + transform_input_kernel = support::cpp14::make_unique>(); + transform_weights_kernel = support::cpp14::make_unique>(); + transform_output_kernel = support::cpp14::make_unique>(); + break; + } + case 5: + { + batched_gemm_kernel = support::cpp14::make_unique>(); + transform_input_kernel = support::cpp14::make_unique>(); + transform_weights_kernel = support::cpp14::make_unique>(); + transform_output_kernel = support::cpp14::make_unique>(); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported."); + break; + } + } + const PaddingType 
use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID; const bool use_same_padding = use_padding_type == PADDING_SAME; @@ -84,22 +117,19 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. constexpr size_t storage_alignment = 64; - const size_t kernel_storage_size = NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3>::get_weight_storage_size(out_channels, in_channels) * data_type_size; + const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _memory_group.manage(&_kernel_storage); _memory_group.manage(&_input_nhwc); _kernel_storage.allocator()->allocate(); // Input storage - - using IT = NEWinogradLayerTransformInputKernel<2, 2, 3, 3>; - const size_t input_storage_size = IT::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size; + const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size; _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _memory_group.manage(&_input_workspace); _input_workspace.allocator()->allocate(); // Output storage - using OT = NEWinogradLayerTransformOutputKernel<2, 2, 3, 3>; - const size_t output_storage_size = OT::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size; + const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, 
out_channels, use_same_padding) * data_type_size; _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _memory_group.manage(&_output_workspace); _output_workspace.allocator()->allocate(); @@ -119,47 +149,57 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); _input_nhwc.allocator()->allocate(); - using T = winograd::WinogradGEMM<2, 2, 3, 3>::Convolution; const int weights_width = weights->info()->dimension(0); const int weights_height = weights->info()->dimension(1); const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels }); // Configure the InputTransform - const int input_matrix_stride = T::get_input_matrix_stride(kernel_shape, in_shape, use_padding_type); - _transform_input_kernel.configure(reinterpret_cast(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, + const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type); + transform_input_kernel->configure(reinterpret_cast(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); // Configure WeightsTransform - const int kernel_matrix_stride = T::get_kernel_matrix_stride(kernel_shape); - _transform_weights_kernel.configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); + transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); // Configure OutputTransform //The biases tensor has not been allocated at this point in 
time, the output transform will add the biases to the final result in the run() method - const int output_matrix_stride = T::get_output_matrix_stride(kernel_shape, in_shape, use_padding_type); - const auto output_shape(T::get_output_shape(kernel_shape, in_shape, use_padding_type)); + const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type); + const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type)); - _transform_output_kernel.configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), output_matrix_stride, reinterpret_cast(_output_nhwc.buffer()), in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); // Configure Batched GEMMs - const int tile_rows = iceildiv(output_shape.n_rows, NEWinogradLayerKernel<2, 2, 3, 3>::_output_tile_rows); - const int tile_cols = iceildiv(output_shape.n_cols, NEWinogradLayerKernel<2, 2, 3, 3>::_output_tile_cols); - const int m = in_shape.n_batches * tile_rows * tile_cols; - const int k = in_shape.n_channels; - const int n = out_channels; - const int input_matrix_row_stride = in_shape.n_channels; - const int kernel_matrix_row_stride = roundup(out_channels, NEWinogradLayerKernel<2, 2, 3, 3>::WinogradConv::N_BLOCK); - const int output_matrix_row_stride = kernel_matrix_row_stride; - - _winograd_kernel.configure(NEWinogradLayerKernel<2, 2, 3, 3>::WinogradBase::N_GEMMS, m, k, n, - input_matrix_stride, input_matrix_row_stride, - kernel_matrix_stride, kernel_matrix_row_stride, - output_matrix_stride, output_matrix_row_stride, - reinterpret_cast(_input_workspace.buffer()), reinterpret_cast(_kernel_storage.buffer()), reinterpret_cast(_output_workspace.buffer())); + const int output_tile_rows = batched_gemm_kernel->get_output_tile_rows(); + const int output_tile_cols = batched_gemm_kernel->get_output_tile_cols(); + const 
int n_block = batched_gemm_kernel->get_number_blocks(); + const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows); + const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols); + const int m = in_shape.n_batches * tile_rows * tile_cols; + const int k = in_shape.n_channels; + const int n = out_channels; + const int input_matrix_row_stride = in_shape.n_channels; + const int kernel_matrix_row_stride = roundup(out_channels, n_block); + const int output_matrix_row_stride = kernel_matrix_row_stride; + const unsigned n_gemms = batched_gemm_kernel->get_number_gemms(); + + batched_gemm_kernel->configure(n_gemms, m, k, n, + input_matrix_stride, input_matrix_row_stride, + kernel_matrix_stride, kernel_matrix_row_stride, + output_matrix_stride, output_matrix_row_stride, + reinterpret_cast(_input_workspace.buffer()), + reinterpret_cast(_kernel_storage.buffer()), + reinterpret_cast(_output_workspace.buffer())); // Reorder the convoluted output to ACL's ordering NCHW _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U)); + + _transform_input_kernel = std::move(transform_input_kernel); + _transform_weights_kernel = std::move(transform_weights_kernel); + _transform_output_kernel = std::move(transform_output_kernel); + _batched_gemm_kernel = std::move(batched_gemm_kernel); } void NEWinogradLayer::run() @@ -169,19 +209,19 @@ void NEWinogradLayer::run() { _reshaped_kernel = true; _permute_weights.run(); - NEScheduler::get().schedule(&_transform_weights_kernel, Window::DimX); + NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); } //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC _permute_input.run(); // Transform input tensor to the winograd domain - NEScheduler::get().schedule(&_transform_input_kernel, Window::DimX); + NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX); //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs - 
NEScheduler::get().schedule(&_winograd_kernel, Window::DimX); + NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX); // Transform output tensor to the spatial domain - NEScheduler::get().schedule(&_transform_output_kernel, Window::DimX); + NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX); // Reorder the convoluted output to ACL's ordering NCHW _permute_output.run(); diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h index 1ddf6094d5..d88efbd21b 100644 --- a/tests/datasets/SmallConvolutionLayerDataset.h +++ b/tests/datasets/SmallConvolutionLayerDataset.h @@ -42,11 +42,15 @@ class SmallWinogradLayerDataset final : public ConvolutionLayerDataset public: SmallWinogradLayerDataset() { + // Kernel size 3 // Batch size 1 add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(6U, 6U, 1U), PadStrideInfo(1, 1, 0, 0)); // Batch size 4 add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(21U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 1)); + + // Kernel size 5 + add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U, 1U), TensorShape(1U), TensorShape(4U, 4U, 1U), PadStrideInfo(1, 1, 0, 0)); } }; -- cgit v1.2.1