diff options
author | Pablo Tello <pablo.tello@arm.com> | 2018-02-14 12:47:30 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:47:18 +0000 |
commit | f6c572ce404c8ac99b0b00c65b757fbadab33dc1 (patch) | |
tree | 0678208f2f333312095d0780c821097196091f87 /arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h | |
parent | 91d20d95df35961d3eb5de497007d98576118d19 (diff) | |
download | ComputeLibrary-f6c572ce404c8ac99b0b00c65b757fbadab33dc1.tar.gz |
COMPMID-784: Productise Winograd.
a) Added support for kernel size 5.
b) Templatised data type for transforms and batched gemms kernels.
Change-Id: Idb83dda7a5eec19e015888ab31902bd791913297
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120540
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h')
-rw-r--r-- | arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h | 349 |
1 files changed, 273 insertions, 76 deletions
diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h index a8645dc07e..9169b75d19 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h @@ -34,8 +34,8 @@ namespace arm_compute { class ITensor; -template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> -class NEWinogradLayerTransformInputKernel : public INEKernel +template <typename T> +class INEWinogradLayerTransformInputKernel : public INEKernel { public: /** Determine how much memory (in units of TIn) to allocate for the @@ -47,14 +47,68 @@ public: * @param[in] n_cols Number of columns in each feature map. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ - static unsigned int get_input_storage_size( + virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] input Input tensor data + * @param[in] n_batches Number of batches in input tensor. + * @param[in] n_rows Number of rows in input tensor. + * @param[in] n_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input tensor. + * @param[in] padding Padding type. + * @param[out] output Base of output matrices. + * @param[in] matrix_stride Stride between output matrices. + */ + virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0; + + virtual ~INEWinogradLayerTransformInputKernel() + { + } +}; + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel<T> +{ +public: + /** Determine how much memory (in units of TIn) to allocate for the + * transformed input. + * + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_channels Number of feature maps in the input tensor. + * @param[in] n_rows Number of rows in each feature map. + * @param[in] n_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + */ + unsigned int get_input_storage_size( int n_batches, int n_channels, int n_rows, int n_cols, - bool same_padding); + bool same_padding) const override; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; NEWinogradLayerTransformInputKernel(); + const char *name() const override { return "NEWinogradLayerTransformInputKernel"; @@ -72,28 +126,29 @@ public: * @param[in] matrix_stride Stride between output matrices. */ void configure( - const float *const input, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels, - const PaddingType padding, - float *const output, - const int matrix_stride); + const T *const input, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels, + const PaddingType padding, + T *const output, + const int matrix_stride) override; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; + using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelCols, KernelCols>; + using WinogradConv = typename WinogradBase::template Convolution<T, T>; + private: - using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelCols, KernelCols>; - using WinogradConv = typename WinogradBase::template Convolution<float, float>; - using InputTransform = typename WinogradBase::template InputTransform<float>; + using InputTransform = typename WinogradBase::template InputTransform<T>; std::unique_ptr<InputTransform> _transform; }; -template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> -class NEWinogradLayerTransformOutputKernel : public INEKernel +template <typename T> +class INEWinogradLayerTransformOutputKernel : public INEKernel { public: /** Determine how much memory (in units of TOut) to allocate for the @@ -105,13 +160,58 @@ public: * @param[in] n_output_channels Number of feature maps in the output tensor. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ - static unsigned int get_output_storage_size( - int n_batches, - int n_rows, - int n_cols, - int n_output_channels, - bool same_padding); + virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0; + + /** Gets the stride between matrices in the output worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + + /** Get the output shape of a convolution. + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] in_shape The shape of the input tensor. + * @param[in] padding The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + virtual Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] biases Pointer to the biases tensor. + * @param[in] output_workingspace Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride() + * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_rows Number of rows in output tensor. + * @param[in] n_cols Number of columns in output tensor. + * @param[in] n_channels Number of feature maps in the output tensor. + */ + virtual void configure( + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) = 0; + + virtual ~INEWinogradLayerTransformOutputKernel() + { + } +}; +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel<T> +{ +public: const char *name() const override { return "NEWinogradLayerTransformOutputKernel"; @@ -130,6 +230,37 @@ public: ~NEWinogradLayerTransformOutputKernel() = default; + // Inherited methods overridden: + /** Determine how much memory (in units of TOut) to allocate for the + * (Winograd domain) output. + * + * @param[in] n_batches Number of batches in the output tensor. + * @param[in] n_rows Number of rows in each feature map of the input tensor. + * @param[in] n_cols Number of columns in each feature map of the input tensor. + * @param[in] n_output_channels Number of feature maps in the output tensor. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + */ + unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override; + + /** Gets the stride between matrices in the output worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] input_shape The shape of the input tensor. + * @param[in] padding_type The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; + /** Get the output shape of a convolution. + * + * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] in_shape The shape of the input tensor. + * @param[in] padding The type of padding to be used. + * + * @return Stride expressed in bytes. + */ + Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const override; + /** Configure the output transform kernel. * * @param[in] biases Pointer to the biases tensor. @@ -142,53 +273,54 @@ public: * @param[in] n_channels Number of feature maps in the output tensor. */ void configure( - const ITensor *biases, - const float *const output_workingspace, - const int matrix_stride, - float *const output, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels); + const ITensor *biases, + const T *const output_workingspace, + const int matrix_stride, + T *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels) override; - // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; private: using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>; - using WinogradConv = typename WinogradBase::template Convolution<float, float>; - using OutputTransform = typename WinogradBase::template OutputTransform<float>; + using WinogradConv = typename WinogradBase::template Convolution<T, T>; + using OutputTransform = typename WinogradBase::template OutputTransform<T>; const ITensor *_biases; - const float *_output_workspace; + const T *_output_workspace; int _matrix_stride; int _matrix_row_stride; - float *_output; + T *_output; int _n_batches; int _n_rows; int _n_cols; int _n_channels; }; -template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> -class NEWinogradLayerTransformWeightsKernel final : public INEKernel +template <typename T> +class INEWinogradLayerTransformWeightsKernel : public INEKernel { public: - /** Determine how much memory (in units of TIn) to allocate for the + /** Determine how much memory (in units of T) to allocate for the * transformed weights. * * @param[in] n_output_channels Number of output feature maps. * @param[in] n_input_channels Number of input feature maps. */ - static unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels); + virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0; + /** Gets the stride between matrices in the kernel worspace + * + * @param[in] kernel_shape The shape of the weights tensor. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(const KernelShape &kernel_shape) const = 0; - NEWinogradLayerTransformWeightsKernel(); - const char *name() const override - { - return "NEWinogradLayerTransformWeightsKernel"; - } - /** Configure the output transform kernel. + /** Configure the weights transform kernel. * * @param[in] weights_hwio Pointer to the weights tensor * @param[in] output Pointer to working space for the output tensor in the Winograd domain. @@ -196,53 +328,119 @@ public: * @param[in] n_output_channels Number of filters. * @param[in] n_input_channels Number of channels in each filter. */ - void configure( - const ITensor *weights_hwio, - float *const output, - const int matrix_stride, - const int n_output_channels, - const int n_input_channels); + virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) = 0; - // Inherited methods overridden: + virtual ~INEWinogradLayerTransformWeightsKernel() + { + } +}; + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel<T> +{ +public: + NEWinogradLayerTransformWeightsKernel(); + const char *name() const override + { + return "NEWinogradLayerTransformWeightsKernel"; + } + // Inherited methods overridden: + void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) override; + unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const override; + int get_matrix_stride(const KernelShape &kernel_shape) const override; void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; private: using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>; - using WinogradConv = typename WinogradBase::template Convolution<float, float>; - using WeightsTransform = typename WinogradBase::template WeightsTransform<float>; + using WinogradConv = typename WinogradBase::template Convolution<T, T>; + using WeightsTransform = typename WinogradBase::template WeightsTransform<T>; std::unique_ptr<WeightsTransform> _transform; }; -template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> -class NEWinogradLayerKernel : public INEKernel +template <typename TIn, typename TOut> +class INEWinogradLayerBatchedGEMMKernel : public INEKernel +{ +public: + /** Get the number of GEMMs to compute + */ + virtual unsigned int get_number_gemms() const = 0; + /** Initialise the kernel + * + * @param[in] n_gemms Number of GEMMs to compute. + * @param[in] M in_shape.n_batches * tile_rows * tile_cols. + * @param[in] K Number of channels in the input tensor. + * @param[in] N Number of channels in the output tensor. + * @param[in] a_matrix_stride Stride between input matrices. + * @param[in] a_row_stride Row stride inside input matrix. + * @param[in] b_matrix_stride Stride between weights matrices. + * @param[in] b_row_stride Row stride inside the weights matrix. + * @param[in] c_matrix_stride Stride between output matrices. + * @param[in] c_row_stride Row stride inside the output matrix. + * @param[out] a_ptr Input workspace. + * @param[out] b_ptr Kernel workspace. + * @param[out] c_ptr Output workspace. + */ + virtual void configure( + const unsigned int n_gemms, + const int M, const int K, const int N, + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) = 0; + + /** Get the number of tiles per row + */ + virtual int get_output_tile_rows() const = 0; + /** Get the number of tiles per columns + */ + virtual int get_output_tile_cols() const = 0; + /** Get the number of blocks + */ + virtual int get_number_blocks() const = 0; +}; + +template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel<TIn, TOut> { public: using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>; - using WinogradConv = typename WinogradBase::template Convolution<float, float>; - using MultiGEMM = winograd::BatchedBlockedGemm<WinogradConv::M_BLOCK, WinogradConv::N_BLOCK, float, float>; + using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>; + using MultiGEMM = winograd::BatchedBlockedGemm<WinogradConv::M_BLOCK, WinogradConv::N_BLOCK, TIn, TOut>; static const int _output_tile_rows = OutputTileRows; static const int _output_tile_cols = OutputTileCols; const char *name() const override { - return "NEWinogradLayerKernel"; + return "NEWinogradLayerBatchedGEMMKernel"; } /** Constructor */ - NEWinogradLayerKernel(); + NEWinogradLayerBatchedGEMMKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerKernel(const NEWinogradLayerKernel &) = delete; + NEWinogradLayerBatchedGEMMKernel(const NEWinogradLayerBatchedGEMMKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerKernel &operator=(const NEWinogradLayerKernel &) = delete; + NEWinogradLayerBatchedGEMMKernel &operator=(const NEWinogradLayerBatchedGEMMKernel &) = delete; /** Allow instances of this class to be moved */ - NEWinogradLayerKernel(NEWinogradLayerKernel &&) = default; + NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel &&) = default; /** Allow instances of this class to be moved */ - NEWinogradLayerKernel &operator=(NEWinogradLayerKernel &&) = default; + NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default; - ~NEWinogradLayerKernel() = default; + ~NEWinogradLayerBatchedGEMMKernel() = default; + + // Inherited methods overridden: + + unsigned int get_number_gemms() const override; + int get_output_tile_rows() const override; + int get_output_tile_cols() const override; + int get_number_blocks() const override; /** Initialise the kernel * @@ -263,17 +461,16 @@ public: void configure( const unsigned int n_gemms, const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const float *const a_ptr, - const float *const b_ptr, - float *const c_ptr); + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const TIn *const a_ptr, + const TIn *const b_ptr, + TOut *const c_ptr) override; - // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: |