From 52140b42f4f663da7f4537abbdebd13df541bcea Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Tue, 30 Jan 2018 14:48:11 +0000 Subject: COMPMID-784: Winograd transforms refactoring 1) Removed the example files winograd_layer.hpp/cpp 2) Templatized winograd transform kernels Change-Id: I7045fa0b801b9d30a11275914aaa2dafd254aed2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118332 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../core/NEON/kernels/NEWinogradLayerKernel.h | 259 +++++++++++---------- .../core/NEON/kernels/winograd/winograd_layer.hpp | 129 ---------- .../runtime/NEON/functions/NEWinogradLayer.h | 37 ++- src/core/NEON/kernels/NEWinogradLayerKernel.cpp | 242 ++++++++++--------- src/core/NEON/kernels/winograd/winograd_gemm.cpp | 4 +- src/core/NEON/kernels/winograd/winograd_layer.cpp | 206 ---------------- src/runtime/NEON/functions/NEWinogradLayer.cpp | 64 ++--- 7 files changed, 324 insertions(+), 617 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp delete mode 100644 src/core/NEON/kernels/winograd/winograd_layer.cpp diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h index ea6c8d813d..97532f3574 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h @@ -25,104 +25,93 @@ #define __ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__ #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/winograd/batched_blocked_gemm.hpp" #include "arm_compute/core/NEON/kernels/winograd/convolution.hpp" #include "arm_compute/core/NEON/kernels/winograd/tensor.hpp" +#include "arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp" namespace arm_compute { class ITensor; -class NEWinogradLayerKernel; -class NEWinogradLayerTransformInputKernel; -class NEWinogradLayerTransformWeightsKernel; -class Winograd3x3F32 final +template +class NEWinogradLayerTransformInputKernel : 
public INEKernel { public: - /** Create a new Winograd convolution layer. + /** Determine how much memory (in units of TIn) to allocate for the + * transformed input. * - * @param[in] n_batches Number of batches in the input and output tensors. - * @param[in] n_input_channels Number of feature maps in a batch of the input tensor. - * @param[in] n_input_rows Number of rows in a feature map of the input tensor. - * @param[in] n_input_cols Number of columns in a feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * @param[in] weights Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. - * @param[out] weights_storage Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size - * @param[in] input Pointer to NHWC ordered input tensor, in the spatial domain. - * @param[out] winograd_input Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. - * @param[in] biases Pointer to the biases vector. - * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. - * @param[out] winograd_output Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. + * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_channels Number of feature maps in the input tensor. + * @param[in] n_rows Number of rows in each feature map. + * @param[in] n_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 
*/ - friend class NEWinogradLayerKernel; - friend class NEWinogradLayerTransformInputKernel; - friend class NEWinogradLayerTransformOutputKernel; - friend class NEWinogradLayerTransformWeightsKernel; + static unsigned int get_input_storage_size( + int n_batches, + int n_channels, + int n_rows, + int n_cols, + bool same_padding); - Winograd3x3F32( - const int n_batches, - const int n_input_channels, - const int n_input_rows, - const int n_input_cols, - const int n_output_channels, - const bool same_padding, - const float *const weights, - float *const weights_storage, + NEWinogradLayerTransformInputKernel(); + const char *name() const override + { + return "NEWinogradLayerTransformInputKernel"; + } + + /** Configure the input transform kernel. + * + * @param[in] input Input tensor data + * @param[in] n_batches Number of batches in input tensor. + * @param[in] n_rows Number of rows in input tensor. + * @param[in] n_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input tensor. + * @param[in] padding Padding type. + * @param[out] output Base of output matrices. + * @param[in] matrix_stride Stride between output matrices. 
+ */ + void configure( const float *const input, - float *const winograd_input, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels, + const PaddingType padding, float *const output, - float *const winograd_output); + const int matrix_stride); - ~Winograd3x3F32(); + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; private: - class Private; - std::unique_ptr _pimpl; + using WinogradBase = winograd::WinogradGEMM; + using WinogradConv = typename WinogradBase::template Convolution; + using InputTransform = typename WinogradBase::template InputTransform; + std::unique_ptr _transform; }; -class INEWinogradLayerTransformKernel : public INEKernel +template +class NEWinogradLayerTransformOutputKernel : public INEKernel { public: - /** Constructor */ - INEWinogradLayerTransformKernel(); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - INEWinogradLayerTransformKernel(const INEWinogradLayerTransformKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - INEWinogradLayerTransformKernel &operator=(const INEWinogradLayerTransformKernel &) = delete; - /** Allow instances of this class to be moved */ - INEWinogradLayerTransformKernel(INEWinogradLayerTransformKernel &&) = default; - /** Allow instances of this class to be moved */ - INEWinogradLayerTransformKernel &operator=(INEWinogradLayerTransformKernel &&) = default; - - virtual ~INEWinogradLayerTransformKernel() = default; - - /** Initialise the kernel + /** Determine how much memory (in units of TOut) to allocate for the + * (Winograd domain) output. * - * @param[in] convolver A pointer to the winograd convolver, this object must have been configured and is ready to execute 16 GEMMS . + * @param[in] n_batches Number of batches in the output tensor. 
+ * @param[in] n_rows Number of rows in each feature map of the input tensor. + * @param[in] n_cols Number of columns in each feature map of the input tensor. + * @param[in] n_output_channels Number of feature maps in the output tensor. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ - virtual void configure(Winograd3x3F32 *convolver); - -protected: - Winograd3x3F32 *_convolver; -}; - -class NEWinogradLayerTransformInputKernel final : public INEWinogradLayerTransformKernel -{ -public: - const char *name() const override - { - return "NEWinogradLayerTransformInputKernel"; - } - // Inherited methods overridden: - void configure(Winograd3x3F32 *convolver) override; - void run(const Window &window, const ThreadInfo &info) override; - bool is_parallelisable() const override; -}; + static unsigned int get_output_storage_size( + int n_batches, + int n_rows, + int n_cols, + int n_output_channels, + bool same_padding); -class NEWinogradLayerTransformOutputKernel final : public INEKernel -{ -public: const char *name() const override { return "NEWinogradLayerTransformOutputKernel"; @@ -167,6 +156,10 @@ public: bool is_parallelisable() const override; private: + using WinogradBase = winograd::WinogradGEMM; + using WinogradConv = typename WinogradBase::template Convolution; + using OutputTransform = typename WinogradBase::template OutputTransform; + const ITensor *_biases; const float *_output_workspace; int _matrix_stride; @@ -178,22 +171,61 @@ private: int _n_channels; }; -class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformKernel +template +class NEWinogradLayerTransformWeightsKernel final : public INEKernel { public: + /** Determine how much memory (in units of TIn) to allocate for the + * transformed weights. + * + * @param[in] n_output_channels Number of output feature maps. + * @param[in] n_input_channels Number of input feature maps. 
+ */ + static unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels); + + NEWinogradLayerTransformWeightsKernel(); const char *name() const override { return "NEWinogradLayerTransformWeightsKernel"; } + /** Configure the weights transform kernel. + * + * @param[in] weights_hwio Pointer to the weights tensor + * @param[out] output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Stride across matrices in the output workspace. + * @param[in] n_output_channels Number of filters. + * @param[in] n_input_channels Number of channels in each filter. + */ + void configure( + const ITensor *weights_hwio, + float *const output, + const int matrix_stride, + const int n_output_channels, + const int n_input_channels); + // Inherited methods overridden: - void configure(Winograd3x3F32 *convolver) override; + void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; + +private: + using WinogradBase = winograd::WinogradGEMM; + using WinogradConv = typename WinogradBase::template Convolution; + using WeightsTransform = typename WinogradBase::template WeightsTransform; + std::unique_ptr _transform; }; -class NEWinogradLayerKernel final : public INEKernel +template +class NEWinogradLayerKernel : public INEKernel { public: + using WinogradBase = winograd::WinogradGEMM; + using WinogradConv = typename WinogradBase::template Convolution; + using MultiGEMM = winograd::BatchedBlockedGemm; + + static const int _output_tile_rows = OutputTileRows; + static const int _output_tile_cols = OutputTileCols; + const char *name() const override { return "NEWinogradLayerKernel"; @@ -214,57 +246,38 @@ public: /** Initialise the kernel * - * @param[in] convolver A pointer to the winograd convolver, this object must have been configured and is ready to execute 16 GEMMS . + * @param[in] n_gemms Number of GEMMs to compute. + * @param[in] M in_shape.n_batches * tile_rows * tile_cols. 
+ * @param[in] K Number of channels in the input tensor. + * @param[in] N Number of channels in the output tensor. + * @param[in] a_matrix_stride Stride between input matrices. + * @param[in] a_row_stride Row stride inside input matrix. + * @param[in] b_matrix_stride Stride between weights matrices. + * @param[in] b_row_stride Row stride inside the weights matrix. + * @param[in] c_matrix_stride Stride between output matrices. + * @param[in] c_row_stride Row stride inside the output matrix. + * @param[in] a_ptr Input workspace. + * @param[in] b_ptr Kernel workspace. + * @param[out] c_ptr Output workspace. */ - void configure(Winograd3x3F32 *convolver); + void configure( + const unsigned int n_gemms, + const int M, const int K, const int N, + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const float *const a_ptr, + const float *const b_ptr, + float *const c_ptr); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - /** Determine how much memory (in units of TIn) to allocate for the - * transformed weights. - * - * @param[in] n_output_channels Number of output feature maps. - * @param[in] n_input_channels Number of input feature maps. - */ - static unsigned int get_weight_storage_size( - const int n_output_channels, - const int n_input_channels); - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed input. - * - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_channels Number of feature maps in the input tensor. - * @param[in] n_rows Number of rows in each feature map. - * @param[in] n_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". 
- */ - static unsigned int get_input_storage_size( - const int n_batches, - const int n_channels, - const int n_rows, - const int n_cols, - const bool same_padding); - - /** Determine how much memory (in units of TOut) to allocate for the - * (Winograd domain) output. - * - * @param[in] n_batches Number of batches in the output tensor. - * @param[in] n_rows Number of rows in each feature map of the input tensor. - * @param[in] n_cols Number of columns in each feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - */ - static unsigned int get_output_storage_size( - const int n_batches, - const int n_rows, - const int n_cols, - const int n_output_channels, - const bool same_padding); - -protected: - Winograd3x3F32 *_convolver; +private: + std::unique_ptr _gemms; }; } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp deleted file mode 100644 index 1db63d750b..0000000000 --- a/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include - -#include "batched_blocked_gemm.hpp" -#include "winograd_gemm.hpp" - -/** Example of how to construct an ACL-like interface. - * - * Use `get_weight_storage_size`, `get_input_storage_size` and - * `get_output_storage_size` to allocate memory for the convolution engine. - * Then create a `WinogradConvolutionLayer`. - * - * Initialise the weights using `weights_transform.run(...)`. - * - * For each inference: - * 1. Transform the inputs to the Winograd domain using `input_transform.run(...)` - * 2. Perform a number of GEMMs using `gemms.run(...)` - * 3. 
Transform the output to the spatial domain using `output_transform.run(...)` - */ -template -class WinogradConvolutionLayer -{ - private: - const KernelShape _kernel_shape; - const Tensor4DShape _input_shape; - const PaddingType _padding; - const Tensor4DShape _output_shape; - const int _n_output_rows, _n_output_cols; - const int _kernel_matrix_stride, _kernel_matrix_row_stride; - const int _input_matrix_stride, _input_matrix_row_stride; - const int _output_matrix_stride, _output_matrix_row_stride; - const int _tile_rows, _tile_cols; - const int _m, _k, _n; - - public: - using WinogradBase = winograd::WinogradGEMM; - using WeightsTransform = typename WinogradBase::template WeightsTransform; - using InputTransform = typename WinogradBase::template InputTransform; - using WinogradConv = typename WinogradBase::template Convolution; - using MultiGEMM = winograd::BatchedBlockedGemm; - using OutputTransform = typename WinogradBase::template OutputTransform; - - /* Public member variables. */ - WeightsTransform weights_transform; /** Operator to transform weights to Winograd domain. */ - InputTransform input_transform; /** Operator to transform input to Winograd domain. */ - MultiGEMM gemms; /** Operator to perform multiple GEMMs. */ - OutputTransform output_transform; /** Operator to transform output from Winograd domain. */ - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed weights. - */ - static unsigned int get_weight_storage_size( - const int n_output_channels, /** Number of output feature maps. */ - const int n_input_channels /** Number of input feature maps. */ - ); - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed input. - */ - static unsigned int get_input_storage_size( - const int n_batches, /** Number of batches in the input tensor. */ - const int n_channels, /** Number of feature maps in the input tensor. */ - const int n_rows, /** Number of rows in each feature map. 
*/ - const int n_cols, /** Number of columns in each feature map. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ - ); - - /** Determine how much memory (in units of TOut) to allocate for the - * (Winograd domain) output. - */ - static unsigned int get_output_storage_size( - const int n_batches, /** Number of batches in the output tensor. */ - const int n_rows, /** Number of rows in each feature map of the input tensor. */ - const int n_cols, /** Number of columns in each feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ - ); - - /** Get the shape (rows, cols) of a feature map of the output tensor. */ - static std::pair get_output_feature_map_shape( - const int n_input_rows, /** Number of rows in the input feature map. */ - const int n_input_cols, /** Number of columns in the input feature map. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ - ); - - /** Create a new Winograd convolution layer. - */ - WinogradConvolutionLayer( - const int n_batches, /** Number of batches in the input and output tensors. */ - const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */ - const int n_input_rows, /** Number of rows in a feature map of the input tensor. */ - const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ - const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ - TIn* const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. 
*/ - const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ - TIn* const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */ - const TOut* const biases, /** Pointer to biases vector. */ - TOut* const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ - TOut* const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ - ); -}; diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h index 1682495f0d..63cac3a3b4 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h @@ -68,25 +68,24 @@ public: NEWinogradLayer &operator=(const NEWinogradLayer &) = delete; private: - MemoryGroup _memory_group; - NEWinogradLayerKernel _winograd_kernel; - NEWinogradLayerTransformInputKernel _transform_input_kernel; - NEWinogradLayerTransformOutputKernel _transform_output_kernel; - NEWinogradLayerTransformWeightsKernel _transform_weights_kernel; - CPPPermute _permute_input; - CPPPermute _permute_weights; - CPPPermute _permute_output; - Tensor _input_workspace; - Tensor _output_workspace; - Tensor _kernel_storage; - Tensor _input_nhwc; - Tensor _output_nhwc; - Tensor _weights_hwio; - const ITensor *_input; - const ITensor *_weights; - ITensor *_output; - bool _reshaped_kernel; - std::unique_ptr _conv; + MemoryGroup _memory_group; + NEWinogradLayerKernel<2, 2, 3, 3> _winograd_kernel; + NEWinogradLayerTransformInputKernel<2, 2, 3, 3> _transform_input_kernel; + NEWinogradLayerTransformOutputKernel<2, 2, 3, 3> _transform_output_kernel; + NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3> _transform_weights_kernel; + CPPPermute _permute_input; + CPPPermute _permute_weights; + CPPPermute 
_permute_output; + Tensor _input_workspace; + Tensor _output_workspace; + Tensor _kernel_storage; + Tensor _input_nhwc; + Tensor _output_nhwc; + Tensor _weights_hwio; + const ITensor *_input; + const ITensor *_weights; + ITensor *_output; + bool _reshaped_kernel; }; } #endif /* __ARM_COMPUTE_NEWINOGRADLAYER_H__ */ diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp index e2e4e40fe4..b0a36ff46a 100644 --- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp @@ -29,173 +29,193 @@ #include "arm_compute/core/TensorInfo.h" #include "support/ToolchainSupport.h" -#include "arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp" - -namespace -{ -using T = WinogradConvolutionLayer<2, 2, 3, 3, float, float>; -} // namespace - namespace arm_compute { -class Winograd3x3F32::Private -{ -public: - Private( - const int n_batches, /** Number of batches in the input and output tensors. */ - const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */ - const int n_input_rows, /** Number of rows in a feature map of the input tensor. */ - const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ - const float *const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ - float *const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ - const float *const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ - float *const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. 
Must be at least the size returned by `get_input_storage_size`. */ - float *const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ - float *const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ - ) - : convolver(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, nullptr, output, winograd_output) - { - } - T convolver; -}; - -Winograd3x3F32::~Winograd3x3F32() -{ -} - -Winograd3x3F32::Winograd3x3F32( - const int n_batches, /** Number of batches in the input and output tensors. */ - const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */ - const int n_input_rows, /** Number of rows in a feature map of the input tensor. */ - const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ - const float *const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ - float *const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ - const float *const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ - float *const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */ - float *const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ - float *const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. 
*/ -) - : _pimpl(support::cpp14::make_unique(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, output, - winograd_output)) -{ -} - -unsigned int NEWinogradLayerKernel::get_input_storage_size(const int n_batches, const int n_channels, const int n_rows, const int n_cols, const bool same_padding) -{ - return T::get_input_storage_size(n_batches, n_channels, n_rows, n_cols, same_padding); -} - -unsigned int NEWinogradLayerKernel::get_output_storage_size( - const int n_batches, /** Number of batches in the output tensor. */ - const int n_rows, /** Number of rows in each feature map of the input tensor. */ - const int n_cols, /** Number of columns in each feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ -) +//Batched Gemms +template +NEWinogradLayerKernel::NEWinogradLayerKernel() + : _gemms() { - return T::get_output_storage_size(n_batches, n_rows, n_cols, n_output_channels, same_padding); } -unsigned int NEWinogradLayerKernel::get_weight_storage_size(const int n_output_channels, const int n_input_channels) +template +void NEWinogradLayerKernel::configure( + const unsigned int n_gemms, + const int M, const int K, const int N, + const int a_matrix_stride, + const int a_row_stride, + const int b_matrix_stride, + const int b_row_stride, + const int c_matrix_stride, + const int c_row_stride, + const float *const a_ptr, + const float *const b_ptr, + float *const c_ptr) { - return T::get_weight_storage_size(n_output_channels, n_input_channels); -} - -NEWinogradLayerKernel::NEWinogradLayerKernel() - : _convolver(nullptr) -{ -} - -void NEWinogradLayerKernel::configure(Winograd3x3F32 *convolver) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(convolver); - _convolver = convolver; + _gemms = support::cpp14::make_unique(n_gemms, M, K, N, a_matrix_stride, 
a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr); Window win; - auto win_last = _convolver->_pimpl->convolver.gemms.get_window(); + auto win_last = _gemms->get_window(); win.set(Window::DimX, Window::Dimension(0, win_last, 1)); INEKernel::configure(win); } -void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); const size_t first_gemm = window.x().start(); const size_t last_gemm = window.x().end(); - _convolver->_pimpl->convolver.gemms.run(first_gemm, last_gemm); + _gemms->run(first_gemm, last_gemm); } -INEWinogradLayerTransformKernel::INEWinogradLayerTransformKernel() - : _convolver(nullptr) +template class NEWinogradLayerKernel<2, 2, 3, 3>; + +// Weights transform + +template +unsigned int NEWinogradLayerTransformWeightsKernel::get_weight_storage_size(int n_output_channels, int n_input_channels) { + const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels); + return static_cast( + // WinogradConv returns the size in bytes, we divide by `sizeof(float)` to + // express that in units of float. + WinogradConv::get_kernel_storage_size(shape) / sizeof(float)); } -void INEWinogradLayerTransformKernel::configure(Winograd3x3F32 *convolver) +template +NEWinogradLayerTransformWeightsKernel::NEWinogradLayerTransformWeightsKernel() + : _transform() { - ARM_COMPUTE_ERROR_ON_NULLPTR(convolver); - _convolver = convolver; } -// Weights transform - -void NEWinogradLayerTransformWeightsKernel::configure(Winograd3x3F32 *convolver) +template +void NEWinogradLayerTransformWeightsKernel::configure( + const ITensor *weights_hwio, + float *const output, + const int matrix_stride, /** Stride across matrices in the output. */ + const int n_output_channels, /** Number of filters. 
*/ + const int n_input_channels) /** Number of channels in each filter. */ { - INEWinogradLayerTransformKernel::configure(convolver); + const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK); + _transform = support::cpp14::make_unique(reinterpret_cast(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels, + n_input_channels); Window win; - auto win_last = _convolver->_pimpl->convolver.weights_transform.get_window(); + auto win_last = _transform->get_window(); win.set(Window::DimX, Window::Dimension(0, win_last, 1)); INEKernel::configure(win); } -void NEWinogradLayerTransformWeightsKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerTransformWeightsKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); const size_t fst = window.x().start(); const size_t lst = window.x().end(); - _convolver->_pimpl->convolver.weights_transform.run(fst, lst); + _transform->run(fst, lst); } -bool NEWinogradLayerTransformWeightsKernel::is_parallelisable() const +template +bool NEWinogradLayerTransformWeightsKernel::is_parallelisable() const { return false; } +template class NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3>; + // Input transform -void NEWinogradLayerTransformInputKernel::configure(Winograd3x3F32 *convolver) +template +unsigned int NEWinogradLayerTransformInputKernel::get_input_storage_size( + int n_batches, /** Number of batches in the input tensor. */ + int n_channels, /** Number of feature maps in the input tensor. */ + int n_rows, /** Number of rows in each feature map. */ + int n_cols, /** Number of columns in each feature map. */ + bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ +) +{ + // Construct shapes for the input and kernel tensors. 
+ const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels); + const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels); + const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; + // Return the size, converted into units of TIn + return static_cast( + WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(float)); +} + +template +NEWinogradLayerTransformInputKernel::NEWinogradLayerTransformInputKernel() + : _transform() { - INEWinogradLayerTransformKernel::configure(convolver); +} + +template +void NEWinogradLayerTransformInputKernel::configure( + const float *const input, /** Input tensor data */ + const int n_batches, /** Number of batches in input tensor. */ + const int n_rows, /** Number of rows in input tensor. */ + const int n_cols, /** Number of columns in input tensor. */ + const int n_channels, /** Number of channels in input tensor. */ + const PaddingType padding, /** Padding type. */ + float *const output, /** Base of output matrices. */ + const int matrix_stride) /** Stride between output matrices. 
*/ +{ + // _input_matrix_row_stride(n_input_channels), + _transform = support::cpp14::make_unique(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels); Window win; - auto win_last = _convolver->_pimpl->convolver.input_transform.get_window(); + auto win_last = _transform->get_window(); win.set(Window::DimX, Window::Dimension(0, win_last, 1)); INEKernel::configure(win); } -void NEWinogradLayerTransformInputKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerTransformInputKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); const size_t fst = window.x().start(); const size_t lst = window.x().end(); - _convolver->_pimpl->convolver.input_transform.run(fst, lst); + _transform->run(fst, lst); } -bool NEWinogradLayerTransformInputKernel::is_parallelisable() const + +template +bool NEWinogradLayerTransformInputKernel::is_parallelisable() const { return false; } +template class NEWinogradLayerTransformInputKernel<2, 2, 3, 3>; + // Output transform -NEWinogradLayerTransformOutputKernel::NEWinogradLayerTransformOutputKernel() + +template +unsigned int NEWinogradLayerTransformOutputKernel::get_output_storage_size( + int n_batches, /** Number of batches in the output tensor. */ + int n_rows, /** Number of rows in each feature map of the input tensor. */ + int n_cols, /** Number of columns in each feature map of the input tensor. */ + int n_output_channels, /** Number of feature maps in the output tensor. */ + bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ +) +{ + // Construct shapes for the input and kernel tensors. + const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1); + const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1); + const PaddingType padding = (same_padding) ? 
PADDING_SAME : PADDING_VALID; + + // Return the size, converted into units of TOut + return static_cast( + WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(float)); +} + +template +NEWinogradLayerTransformOutputKernel::NEWinogradLayerTransformOutputKernel() : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0) { } -void NEWinogradLayerTransformOutputKernel::configure( +template +void NEWinogradLayerTransformOutputKernel::configure( const ITensor *biases, const float *const output_workingspace, const int matrix_stride, @@ -205,13 +225,10 @@ void NEWinogradLayerTransformOutputKernel::configure( const int n_cols, const int n_channels) { - using WinogradBase = winograd::WinogradGEMM<2, 2, 3, 3>; - using OutputTransform = typename WinogradBase::template OutputTransform; - _biases = biases; _output_workspace = output_workingspace; _matrix_stride = matrix_stride; - _matrix_row_stride = roundup(n_channels, WinogradBase::Convolution::N_BLOCK); + _matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK); _output = output; _n_batches = n_batches; _n_rows = n_rows; @@ -226,7 +243,8 @@ void NEWinogradLayerTransformOutputKernel::configure( INEKernel::configure(win); } -void NEWinogradLayerTransformOutputKernel::run(const Window &window, const ThreadInfo &info) +template +void NEWinogradLayerTransformOutputKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -234,9 +252,6 @@ void NEWinogradLayerTransformOutputKernel::run(const Window &window, const Threa ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace); ARM_COMPUTE_ERROR_ON_NULLPTR(_output); - using WinogradBase = winograd::WinogradGEMM<2, 2, 3, 3>; - using OutputTransform = typename WinogradBase::template OutputTransform; - OutputTransform output_transform(_output_workspace, _matrix_stride, 
_matrix_row_stride, reinterpret_cast(_biases->buffer()), _output, _n_batches, _n_rows, _n_cols, _n_channels); @@ -247,9 +262,12 @@ void NEWinogradLayerTransformOutputKernel::run(const Window &window, const Threa output_transform.run(fst, lst); } -bool NEWinogradLayerTransformOutputKernel::is_parallelisable() const +template +bool NEWinogradLayerTransformOutputKernel::is_parallelisable() const { return false; } +template class NEWinogradLayerTransformOutputKernel<2, 2, 3, 3>; + } // namespace arm_compute diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/winograd/winograd_gemm.cpp index b45f6f55d9..05426450a6 100644 --- a/src/core/NEON/kernels/winograd/winograd_gemm.cpp +++ b/src/core/NEON/kernels/winograd/winograd_gemm.cpp @@ -36,8 +36,8 @@ Tensor4DShape WinogradGEMM::Convolution::get_output { return Tensor4DShape { in_shape.n_batches, - (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 2), - (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 2), + (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1), + (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1), kernel_shape.n_output_channels, in_shape.ordering }; diff --git a/src/core/NEON/kernels/winograd/winograd_layer.cpp b/src/core/NEON/kernels/winograd/winograd_layer.cpp deleted file mode 100644 index f16d62c0ef..0000000000 --- a/src/core/NEON/kernels/winograd/winograd_layer.cpp +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "convolution.hpp" -#include "winograd_layer.hpp" -#include "tensor.hpp" - - -/** Determine how much memory (in units of TIn) to allocate for the transformed - * weights. - */ -template < - int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, - typename TIn, typename TOut -> -unsigned int WinogradConvolutionLayer< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut ->::get_weight_storage_size( - const int n_output_channels, /** Number of output feature maps. */ - const int n_input_channels /** Number of input feature maps. */ -) -{ - const KernelShape shape( - n_output_channels, KernelRows, KernelCols, n_input_channels - ); - return static_cast( - // WinogradConv returns the size in bytes, we divide by `sizeof(TIn)` to - // express that in units of TIn. 
- WinogradConv::get_kernel_storage_size(shape) / sizeof(TIn) - ); -} - - -/** Determine how much memory (in units of TIn) to allocate for the transformed - * input. - */ -template < - int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, - typename TIn, typename TOut -> -unsigned int WinogradConvolutionLayer< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut ->::get_input_storage_size( - const int n_batches, /** Number of batches in the input tensor. */ - const int n_channels, /** Number of feature maps in the input tensor. */ - const int n_rows, /** Number of rows in each feature map. */ - const int n_cols, /** Number of columns in each feature map. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ -) -{ - // Construct shapes for the input and kernel tensors. - const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels); - const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels); - const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; - - // Return the size, converted into units of TIn - return static_cast( - WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / - sizeof(TIn) - ); -} - - -/** Determine how much memory (in units of TOut) to allocate for the (Winograd - * domain) output. - */ -template < - int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, - typename TIn, typename TOut -> -unsigned int WinogradConvolutionLayer< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut ->::get_output_storage_size( - const int n_batches, /** Number of batches in the output tensor. */ - const int n_rows, /** Number of rows in each feature map of the input tensor. */ - const int n_cols, /** Number of columns in each feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". 
*/ -) -{ - // Construct shapes for the input and kernel tensors. - const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1); - const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1); - const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; - - // Return the size, converted into units of TOut - return static_cast( - WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / - sizeof(TOut) - ); -} - - -/** Get the shape (rows, cols) of a feature map of the output tensor. */ -template < - int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, - typename TIn, typename TOut -> -std::pair WinogradConvolutionLayer< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut ->::get_output_feature_map_shape( - const int n_input_rows, /** Number of rows in the input feature map. */ - const int n_input_cols, /** Number of columns in the input feature map. */ - const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ -) -{ - // Construct shapes for the input and kernel tensors. - const Tensor4DShape input_shape(1, n_input_rows, n_input_cols, 1); - const KernelShape kern_shape(1, KernelRows, KernelCols, 1); - const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; - - // Compute the new shape - const auto output_shape = WinogradConv::get_output_shape( - kern_shape, input_shape, padding - ); - - return std::make_pair(output_shape.n_rows, output_shape.n_cols); -} - - -/** Create a new Winograd convolution layer. - */ -template < - int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, - typename TIn, typename TOut -> -WinogradConvolutionLayer:: -WinogradConvolutionLayer( - const int n_batches, /** Number of batches in the input and output tensors. */ - const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */ - const int n_input_rows, /** Number of rows in a feature map of the input tensor. 
*/ - const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ - const int n_output_channels, /** Number of feature maps in the output tensor. */ - const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ - const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ - TIn* const winograd_weights, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ - const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ - TIn* const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */ - const TOut* const biases, /** Pointer to biases vector. */ - TOut* const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ - TOut* const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ -) : _kernel_shape(n_output_channels, KernelRows, KernelCols, n_input_channels), - _input_shape(n_batches, n_input_rows, n_input_cols, n_input_channels), - _padding(same_padding ? 
PADDING_SAME : PADDING_VALID), - _output_shape(WinogradConv::get_output_shape(_kernel_shape, _input_shape, _padding)), - _n_output_rows(_output_shape.n_rows), - _n_output_cols(_output_shape.n_cols), - _kernel_matrix_stride(WinogradConv::get_kernel_matrix_stride(_kernel_shape)), - _kernel_matrix_row_stride(roundup(n_output_channels, WinogradConv::N_BLOCK)), - _input_matrix_stride(WinogradConv::get_input_matrix_stride(_kernel_shape, _input_shape, _padding)), - _input_matrix_row_stride(n_input_channels), - _output_matrix_stride(WinogradConv::get_output_matrix_stride(_kernel_shape, _input_shape, _padding)), - _output_matrix_row_stride(_kernel_matrix_row_stride), - _tile_rows(iceildiv(_n_output_rows, OutputTileRows)), - _tile_cols(iceildiv(_n_output_cols, OutputTileCols)), - _m(n_batches * _tile_rows * _tile_cols), - _k(n_input_channels), - _n(n_output_channels), - weights_transform( - weights, winograd_weights, - _kernel_matrix_stride, _kernel_matrix_row_stride, - n_output_channels, n_input_channels - ), - input_transform( - input, n_batches, n_input_rows, n_input_cols, n_input_channels, _padding, - winograd_input, _input_matrix_stride, _input_matrix_row_stride - ), - gemms( - WinogradBase::N_GEMMS, _m, _k, _n, - _input_matrix_stride, _input_matrix_row_stride, - _kernel_matrix_stride, _kernel_matrix_row_stride, - _output_matrix_stride, _output_matrix_row_stride, - winograd_input, winograd_weights, winograd_output - ), - output_transform( - winograd_output, _output_matrix_stride, _output_matrix_row_stride, biases, - output, n_batches, _n_output_rows, _n_output_cols, n_output_channels - ) -{ -} - -// Instantiate valid implementations. 
-template class WinogradConvolutionLayer<2, 2, 3, 3, float, float>; -template class WinogradConvolutionLayer<4, 4, 3, 3, float, float>; -template class WinogradConvolutionLayer<2, 2, 5, 5, float, float>; diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp index e8c77412a2..6196c514a8 100644 --- a/src/runtime/NEON/functions/NEWinogradLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp @@ -46,7 +46,7 @@ namespace arm_compute { NEWinogradLayer::NEWinogradLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _winograd_kernel(), _transform_input_kernel(), _transform_output_kernel(), _transform_weights_kernel(), _permute_input(), _permute_weights(), - _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv() + _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false) { } /* arm_compute */ @@ -81,19 +81,23 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. 
constexpr size_t storage_alignment = 64; - const size_t kernel_storage_size = NEWinogradLayerKernel::get_weight_storage_size(out_channels, in_channels) * data_type_size; + const size_t kernel_storage_size = NEWinogradLayerTransformWeightsKernel<2, 2, 3, 3>::get_weight_storage_size(out_channels, in_channels) * data_type_size; _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _memory_group.manage(&_kernel_storage); _memory_group.manage(&_input_nhwc); _kernel_storage.allocator()->allocate(); // Input storage - const size_t input_storage_size = NEWinogradLayerKernel::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, false) * data_type_size; + const size_t input_storage_size = NEWinogradLayerTransformInputKernel<2, 2, 3, 3>::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, + false) + * data_type_size; _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _memory_group.manage(&_input_workspace); _input_workspace.allocator()->allocate(); // Output storage - const size_t output_storage_size = NEWinogradLayerKernel::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, false) * data_type_size; + const size_t output_storage_size = NEWinogradLayerTransformOutputKernel<2, 2, 3, 3>::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, + false) + * data_type_size; _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _memory_group.manage(&_output_workspace); _output_workspace.allocator()->allocate(); @@ -132,38 +136,46 @@ void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, co _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); 
_input_nhwc.allocator()->allocate(); - // Create Winograd operator object - _conv = support::cpp14::make_unique( - in_shape.n_batches, - in_shape.n_channels, - in_shape.n_rows, - in_shape.n_cols, - out_channels, - false, - reinterpret_cast(_weights_hwio.buffer()), - reinterpret_cast(_kernel_storage.buffer()), - reinterpret_cast(_input_nhwc.buffer()), - reinterpret_cast(_input_workspace.buffer()), - reinterpret_cast(_output_nhwc.buffer()), - reinterpret_cast(_output_workspace.buffer())); - - // Configure the kernel, padding not needed so it's safe to call configure after allocare - _winograd_kernel.configure(_conv.get()); - _transform_input_kernel.configure(_conv.get()); - _transform_weights_kernel.configure(_conv.get()); - //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method using T = winograd::WinogradGEMM<2, 2, 3, 3>::Convolution; const int weights_width = weights->info()->dimension(0); const int weights_height = weights->info()->dimension(1); const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels }); - const int output_matrix_stride = T::get_output_matrix_stride(kernel_shape, in_shape, PADDING_VALID); - const auto output_shape(T::get_output_shape(kernel_shape, in_shape, PADDING_VALID)); + + // Configure the InputTransform + const int input_matrix_stride = T::get_input_matrix_stride(kernel_shape, in_shape, PADDING_VALID); + _transform_input_kernel.configure(reinterpret_cast(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, PADDING_VALID, + reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + + // Configure WeightsTransform + const int kernel_matrix_stride = T::get_kernel_matrix_stride(kernel_shape); + _transform_weights_kernel.configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + + // Configure 
OutputTransform + //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method + const int output_matrix_stride = T::get_output_matrix_stride(kernel_shape, in_shape, PADDING_VALID); + const auto output_shape(T::get_output_shape(kernel_shape, in_shape, PADDING_VALID)); _transform_output_kernel.configure(biases, reinterpret_cast(_output_workspace.buffer()), output_matrix_stride, reinterpret_cast(_output_nhwc.buffer()), in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); + // Configure Batched GEMMs + const int tile_rows = iceildiv(output_shape.n_rows, NEWinogradLayerKernel<2, 2, 3, 3>::_output_tile_rows); + const int tile_cols = iceildiv(output_shape.n_cols, NEWinogradLayerKernel<2, 2, 3, 3>::_output_tile_cols); + const int m = in_shape.n_batches * tile_rows * tile_cols; + const int k = in_shape.n_channels; + const int n = out_channels; + const int input_matrix_row_stride = in_shape.n_channels; + const int kernel_matrix_row_stride = roundup(out_channels, NEWinogradLayerKernel<2, 2, 3, 3>::WinogradConv::N_BLOCK); + const int output_matrix_row_stride = kernel_matrix_row_stride; + + _winograd_kernel.configure(NEWinogradLayerKernel<2, 2, 3, 3>::WinogradBase::N_GEMMS, m, k, n, + input_matrix_stride, input_matrix_row_stride, + kernel_matrix_stride, kernel_matrix_row_stride, + output_matrix_stride, output_matrix_row_stride, + reinterpret_cast(_input_workspace.buffer()), reinterpret_cast(_kernel_storage.buffer()), reinterpret_cast(_output_workspace.buffer())); + // Reorder the convoluted output to ACL's ordering NCHW _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U)); } -- cgit v1.2.1