From 7df27869aff38b07b50e4fe589f6b2cf51954a92 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 30 May 2018 11:44:26 +0100 Subject: COMPMID-1162: Enable NHWC data layout support for NEWinogradConvolutionLayer - part1 In this first part we reworked the configuration of the kernels as before we passed the raw pointer to the buffer within the configuration of the function Change-Id: I83d3cb64c562303093c7f0ae52395ecd080a5d52 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/133560 Tested-by: Jenkins Reviewed-by: Giorgio Arena Reviewed-by: Gian Marco Iodice --- .../kernels/NEWinogradConvolutionLayerKernel.h | 198 ++++++++++++--------- .../kernels/NEWinogradConvolutionLayerKernel.cpp | 140 ++++++++------- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 78 +++++--- .../fixtures/WinogradConvolutionLayerFixture.h | 5 +- 4 files changed, 257 insertions(+), 164 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h index 6b8866cb2e..68c133ee37 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h @@ -42,15 +42,15 @@ public: /** Determine how much memory (in units of TIn) to allocate for the * transformed input. * - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_channels Number of feature maps in the input tensor. - * @param[in] n_rows Number of rows in each feature map. - * @param[in] n_cols Number of columns in each feature map. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Storage size (in units of TIn) required. */ - virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0; + virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; /** Gets the stride between matrices in the input worspace * @@ -64,16 +64,17 @@ public: /** Configure the output transform kernel. * - * @param[in] input Input tensor data - * @param[in] n_batches Number of batches in input tensor. - * @param[in] n_rows Number of rows in input tensor. - * @param[in] n_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input tensor. + * @param[in] input_nhwc Input tensor in NHWC data layout format. + * @param[in] num_batches Number of batches in input tensor. + * @param[in] num_rows Number of rows in input tensor. + * @param[in] num_cols Number of columns in input tensor. + * @param[in] num_channels Number of channels in input tensor. * @param[in] padding Padding type. * @param[out] output Base of output matrices. * @param[in] matrix_stride Stride between output matrices. */ - virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0; + virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, + const PaddingType padding, T *const output, const int matrix_stride) = 0; /** Destructor */ virtual ~INEWinogradLayerTransformInputKernel() @@ -86,22 +87,33 @@ template { public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete; + /** Allow instances of this class to be moved */ + NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default; + /** Allow instances of this class to be moved */ + NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default; + /** Default destructor */ + ~NEWinogradLayerTransformInputKernel() = default; + /** Determine how much memory (in units of TIn) to allocate for the * transformed input. * - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_channels Number of feature maps in the input tensor. - * @param[in] n_rows Number of rows in each feature map. - * @param[in] n_cols Number of columns in each feature map. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Storage size (in units of TIn) required. */ unsigned int get_input_storage_size( - int n_batches, - int n_channels, - int n_rows, - int n_cols, + int num_batches, + int num_channels, + int num_rows, + int num_cols, bool same_padding) const override; /** Gets the stride between matrices in the input worspace @@ -124,21 +136,21 @@ public: /** Configure the output transform kernel. * - * @param[in] input Input tensor data. Data types supported: F32. - * @param[in] n_batches Number of batches in input tensor. - * @param[in] n_rows Number of rows in input tensor. - * @param[in] n_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input tensor. + * @param[in] input_nhwc Input tensor. Data types supported: F32. Layout supported NHWC. + * @param[in] num_batches Number of batches in input tensor. + * @param[in] num_rows Number of rows in input tensor. + * @param[in] num_cols Number of columns in input tensor. + * @param[in] num_channels Number of channels in input tensor. * @param[in] padding Padding type. * @param[out] output Base of output matrices. * @param[in] matrix_stride Stride between output matrices. */ void configure( - const T *const input, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels, + const ITensor *input_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, const PaddingType padding, T *const output, const int matrix_stride) override; @@ -163,7 +175,14 @@ public: private: using InputTransform = typename WinogradBase::template InputTransform; - std::unique_ptr _transform; + const ITensor *_input_nhwc; + int _num_batches; /**< Number of batches in input tensor. */ + int _num_rows; /**< Number of rows in input tensor. */ + int _num_cols; /**< Number of columns in input tensor. */ + int _num_channels; /**< Number of channels in input tensor. */ + PaddingType _padding; /**< Padding type. */ + T *_output; /**< Base of output matrices. */ + int _matrix_stride; /**< Stride between output matrices. */ }; /** Interface for the NEON kernel to perform Winograd output transform. */ @@ -174,15 +193,15 @@ public: /** Determine how much memory (in units of TOut) to allocate for the * (Winograd domain) output. * - * @param[in] n_batches Number of batches in the output tensor. - * @param[in] n_rows Number of rows in each feature map of the input tensor. - * @param[in] n_cols Number of columns in each feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Storage size (in units of TOut) required. */ - virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0; + virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels, bool same_padding) const = 0; /** Gets the stride between matrices in the output worspace * @@ -209,21 +228,21 @@ public: * @param[in] biases Pointer to the biases tensor. * @param[in] output_workingspace Pointer to working space for the output tensor in the Winograd domain. * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() - * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_rows Number of rows in output tensor. - * @param[in] n_cols Number of columns in output tensor. - * @param[in] n_channels Number of feature maps in the output tensor. + * @param[out] output_nhwc Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_rows Number of rows in output tensor. + * @param[in] num_cols Number of columns in output tensor. + * @param[in] num_channels Number of feature maps in the output tensor. */ virtual void configure( const ITensor *biases, const T *const output_workingspace, const int matrix_stride, - T *const output, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels) = 0; + ITensor *const output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels) = 0; virtual ~INEWinogradLayerTransformOutputKernel() { @@ -257,15 +276,15 @@ public: /** Determine how much memory (in units of TOut) to allocate for the * (Winograd domain) output. * - * @param[in] n_batches Number of batches in the output tensor. - * @param[in] n_rows Number of rows in each feature map of the input tensor. - * @param[in] n_cols Number of columns in each feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Storage size (in units of TOut) required. */ - unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override; + unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels, bool same_padding) const override; /** Gets the stride between matrices in the output worspace * @@ -291,21 +310,21 @@ public: * @param[in] biases Pointer to the biases tensor. * @param[in] output_workingspace Pointer to working space for the output tensor in the Winograd domain. * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() - * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_rows Number of rows in output tensor. - * @param[in] n_cols Number of columns in output tensor. - * @param[in] n_channels Number of feature maps in the output tensor. + * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_rows Number of rows in output tensor. + * @param[in] num_cols Number of columns in output tensor. + * @param[in] num_channels Number of feature maps in the output tensor. */ void configure( const ITensor *biases, const T *const output_workingspace, const int matrix_stride, - T *const output, - const int n_batches, - const int n_rows, - const int n_cols, - const int n_channels) override; + ITensor *const output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels) override; void run(const Window &window, const ThreadInfo &info) override; @@ -329,11 +348,11 @@ private: const T *_output_workspace; int _matrix_stride; int _matrix_row_stride; - T *_output; - int _n_batches; - int _n_rows; - int _n_cols; - int _n_channels; + ITensor *_output_nhwc; + int _num_batches; + int _num_rows; + int _num_cols; + int _num_channels; }; /** Interface for the NEON kernel to perform Winograd weights transform. */ @@ -344,12 +363,12 @@ public: /** Determine how much memory (in units of T) to allocate for the * transformed weights. * - * @param[in] n_output_channels Number of output feature maps. - * @param[in] n_input_channels Number of input feature maps. + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. * * @return Storage size (in units of T) required. */ - virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0; + virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0; /** Gets the stride between matrices in the kernel worspace * * @param[in] kernel_shape The shape of the weights tensor. @@ -360,13 +379,14 @@ public: /** Configure the weights transform kernel. * - * @param[in] weights_hwio Pointer to the weights tensor - * @param[in] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] n_output_channels Number of filters. - * @param[in] n_input_channels Number of channels in each filter. + * @param[in] weights_hwio Pointer to the weights tensor + * @param[in] output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Stride across matrices in the output workspace. + * @param[in] num_output_channels Number of filters. + * @param[in] num_input_channels Number of channels in each filter. */ - virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) = 0; + + virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; virtual ~INEWinogradLayerTransformWeightsKernel() { @@ -378,6 +398,17 @@ template { public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete; + /** Allow instances of this class to be moved */ + NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default; + /** Allow instances of this class to be moved */ + NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default; + /** Default destructor */ + ~NEWinogradLayerTransformWeightsKernel() = default; + /** Default constructor. */ NEWinogradLayerTransformWeightsKernel(); const char *name() const override @@ -397,8 +428,8 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); // Inherited methods overridden: - void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) override; - unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const override; + void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; + unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; int get_matrix_stride(const KernelShape &kernel_shape) const override; void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; @@ -407,7 +438,12 @@ private: using WinogradBase = winograd::WinogradGEMM; using WinogradConv = typename WinogradBase::template Convolution; using WeightsTransform = typename WinogradBase::template WeightsTransform; - std::unique_ptr _transform; + + const ITensor *_weights_hwio; + T *_output; + int _matrix_stride; + int _num_output_channels; + int _num_input_channels; }; /** Interface for the NEON kernel to perform Winograd. */ @@ -421,7 +457,7 @@ public: /** Initialise the kernel * * @param[in] n_gemms Number of GEMMs to compute. - * @param[in] M in_shape.n_batches * tile_rows * tile_cols. + * @param[in] M in_shape.num_batches * tile_rows * tile_cols. * @param[in] K Number of channels in the input tensor. * @param[in] N Number of channels in the output tensor. * @param[in] a_matrix_stride Stride between input matrices. @@ -498,7 +534,7 @@ public: /** Initialise the kernel * * @param[in] n_gemms Number of GEMMs to compute. - * @param[in] M in_shape.n_batches * tile_rows * tile_cols. + * @param[in] M in_shape.num_batches * tile_rows * tile_cols. * @param[in] K Number of channels in the input tensor. * @param[in] N Number of channels in the output tensor. * @param[in] a_matrix_stride Stride between input matrices. diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp index 672684d14f..cfd53d7082 100644 --- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp @@ -309,9 +309,9 @@ template class NEWinogradLayerBatchedGEMMKernel; // Weights transform template -unsigned int NEWinogradLayerTransformWeightsKernel::get_weight_storage_size(int n_output_channels, int n_input_channels) const +unsigned int NEWinogradLayerTransformWeightsKernel::get_weight_storage_size(int num_output_channels, int num_input_channels) const { - const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels); + const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels); return static_cast( // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T WinogradConv::get_kernel_storage_size(shape) / sizeof(T)); @@ -319,7 +319,8 @@ unsigned int NEWinogradLayerTransformWeightsKernel NEWinogradLayerTransformWeightsKernel::NEWinogradLayerTransformWeightsKernel() - : _transform() + : _weights_hwio(nullptr), _output(nullptr), _matrix_stride(0), _num_output_channels(0), _num_input_channels(0) + { } @@ -333,15 +334,20 @@ template ::configure( const ITensor *weights_hwio, T *const output, - const int matrix_stride, /** Stride across matrices in the output. */ - const int n_output_channels, /** Number of filters. */ - const int n_input_channels) /** Number of channels in each filter. */ -{ - const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK); - _transform = support::cpp14::make_unique(reinterpret_cast(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels, - n_input_channels); - Window win; - auto win_last = _transform->get_window(); + const int matrix_stride, /** Stride across matrices in the output. */ + const int num_output_channels, /** Number of filters. */ + const int num_input_channels) /** Number of channels in each filter. */ +{ + _weights_hwio = weights_hwio; + _output = output; + _matrix_stride = matrix_stride; + _num_output_channels = num_output_channels; + _num_input_channels = num_input_channels; + + const int matrix_row_stride = roundup(num_output_channels, WinogradConv::N_BLOCK); + WeightsTransform transform(nullptr, output, matrix_stride, matrix_row_stride, num_output_channels, num_input_channels); + Window win; + auto win_last = transform.get_window(); win.set(Window::DimX, Window::Dimension(0, win_last, 1)); INEKernel::configure(win); } @@ -351,9 +357,12 @@ void NEWinogradLayerTransformWeightsKernelrun(fst, lst); + + const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK); + WeightsTransform transform(reinterpret_cast(_weights_hwio->buffer()), _output, _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + transform.run(fst, lst); } template @@ -379,16 +388,16 @@ template class NEWinogradLayerTransformWeightsKernel; template unsigned int NEWinogradLayerTransformInputKernel::get_input_storage_size( - int n_batches, /** Number of batches in the input tensor. */ - int n_channels, /** Number of feature maps in the input tensor. */ - int n_rows, /** Number of rows in each feature map. */ - int n_cols, /** Number of columns in each feature map. */ - bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ + int num_batches, /* Number of batches in the input tensor. */ + int num_channels, /* Number of feature maps in the input tensor. */ + int num_rows, /* Number of rows in each feature map. */ + int num_cols, /* Number of columns in each feature map. */ + bool same_padding /* Use "SAME" padding, otherwise use "VALID". */ ) const { // Construct shapes for the input and kernel tensors. - const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels); - const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels); + const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels); + const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels); const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; // Return the size, converted into units of TIn return static_cast(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T)); @@ -403,25 +412,32 @@ int NEWinogradLayerTransformInputKernel NEWinogradLayerTransformInputKernel::NEWinogradLayerTransformInputKernel() - : _transform() + : _input_nhwc(), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0), _padding(), _output(nullptr), _matrix_stride(0) { } template void NEWinogradLayerTransformInputKernel::configure( - const T *const input, /** Input tensor data */ - const int n_batches, /** Number of batches in input tensor. */ - const int n_rows, /** Number of rows in input tensor. */ - const int n_cols, /** Number of columns in input tensor. */ - const int n_channels, /** Number of channels in input tensor. */ - const PaddingType padding, /** Padding type. */ - T *const output, /** Base of output matrices. */ - const int matrix_stride) /** Stride between output matrices. */ -{ - // _input_matrix_row_stride(n_input_channels), - _transform = support::cpp14::make_unique(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels); - Window win; - auto win_last = _transform->get_window(); + const ITensor *input_nhwc, + const int num_batches, /* Number of batches in input tensor. */ + const int num_rows, /* Number of rows in input tensor. */ + const int num_cols, /* Number of columns in input tensor. */ + const int num_channels, /* Number of channels in input tensor. */ + const PaddingType padding, /* Padding type. */ + T *const output, /* Base of output matrices. */ + const int matrix_stride) /* Stride between output matrices. */ +{ + _input_nhwc = input_nhwc; + _num_batches = num_batches; + _num_rows = num_rows; + _num_cols = num_cols; + _num_channels = num_channels; + _padding = padding; + _output = output; + _matrix_stride = matrix_stride; + InputTransform transform(nullptr, num_batches, num_rows, num_cols, num_channels, padding, output, matrix_stride, num_channels); + Window win; + auto win_last = transform.get_window(); win.set(Window::DimX, Window::Dimension(0, win_last, 1)); INEKernel::configure(win); } @@ -431,9 +447,13 @@ void NEWinogradLayerTransformInputKernel(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, _output, _matrix_stride, _num_channels); + + // The code below cannot be moved to configure because biases hasn't been allocated at that point const size_t fst = window.x().start(); const size_t lst = window.x().end(); - _transform->run(fst, lst); + input_transform.run(fst, lst); } template @@ -453,16 +473,16 @@ template class NEWinogradLayerTransformInputKernel; template unsigned int NEWinogradLayerTransformOutputKernel::get_output_storage_size( - int n_batches, /** Number of batches in the output tensor. */ - int n_rows, /** Number of rows in each feature map of the input tensor. */ - int n_cols, /** Number of columns in each feature map of the input tensor. */ - int n_output_channels, /** Number of feature maps in the output tensor. */ - bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ + int num_batches, /* Number of batches in the output tensor. */ + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + int num_output_channels, /* Number of feature maps in the output tensor. */ + bool same_padding /* Use "SAME" padding, otherwise use "VALID". */ ) const { // Construct shapes for the input and kernel tensors. - const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1); - const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1); + const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1); + const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1); const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID; // Return the size, converted into units of TOut @@ -472,7 +492,7 @@ unsigned int NEWinogradLayerTransformOutputKernel NEWinogradLayerTransformOutputKernel::NEWinogradLayerTransformOutputKernel() - : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0) + : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output_nhwc(nullptr), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0) { } @@ -494,24 +514,24 @@ void NEWinogradLayerTransformOutputKernel(_biases->buffer()) : nullptr), _output, - _n_batches, _n_rows, _n_cols, _n_channels); + (_biases ? reinterpret_cast(_biases->buffer()) : nullptr), reinterpret_cast(_output_nhwc->buffer()), + _num_batches, _num_rows, _num_cols, _num_channels); // The code below cannot be moved to configure because biases hasn't been allocated at that point const size_t fst = window.x().start(); diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 1a9c72965b..d6bc5cfd9a 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -60,8 +60,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162 ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported"); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1287 ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); @@ -107,6 +107,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); } + } //namespace NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr memory_manager) @@ -218,33 +219,60 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * _output_nhwc.allocator()->init(info); _output_nhwc.allocator()->allocate(); - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); - _weights_hwio.allocator()->allocate(); - - // configure the kernel to transform the input tensor from NCHW -> NHWC - _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); - _input_nhwc.allocator()->allocate(); - const KernelShape kernel_shape({ out_channels, static_cast(kernel_size.height), static_cast(kernel_size.width), in_channels }); // Configure the InputTransform const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type); - transform_input_kernel->configure(reinterpret_cast(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + + if(data_layout == DataLayout::NCHW) + { + // configure the kernel to transform the input tensor from NCHW -> NHWC + _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); + _input_nhwc.allocator()->allocate(); + transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, + reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + } + else + { + transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, + reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + } // Configure WeightsTransform const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + if(data_layout == DataLayout::NCHW) + { + // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] + _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); + + transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + } + else + { + // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] + _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U)); + + transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + } + _weights_hwio.allocator()->allocate(); // Configure OutputTransform //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type); const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type)); - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), - output_matrix_stride, reinterpret_cast(_output_nhwc.buffer()), - in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); + if(data_layout == DataLayout::NCHW) + { + transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + output_matrix_stride, &_output_nhwc, + in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); + } + else + { + transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + output_matrix_stride, _output, + in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); + } // Configure GEMM const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height); @@ -293,14 +321,16 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if(data_layout == DataLayout::NCHW && _is_activationlayer_enabled) { - _activationlayer_function.configure(output, nullptr, act_info); + _activationlayer_function.configure(_output, nullptr, act_info); } } void NEWinogradConvolutionLayer::run() { + const DataLayout data_layout = _input->info()->data_layout(); + _memory_group.acquire(); if(!_reshaped_kernel) { @@ -308,9 +338,12 @@ void NEWinogradConvolutionLayer::run() _permute_weights.run(); NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); } - //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - _permute_input.run(); + if(data_layout == DataLayout::NCHW) + { + //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC + _permute_input.run(); + } // Transform input tensor to the winograd domain NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX); @@ -320,8 +353,11 @@ void NEWinogradConvolutionLayer::run() // Transform output tensor to the spatial domain NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX); - // Reorder the convoluted output to ACL's ordering NCHW - _permute_output.run(); + if(data_layout == DataLayout::NCHW) + { + // Reorder the convoluted output to ACL's ordering NCHW + _permute_output.run(); + } if(_is_activationlayer_enabled) { diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h index 6381b99131..f40f3d2e43 100644 --- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h +++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h @@ -55,7 +55,8 @@ class WinogradConvolutionLayerValidationFixture : public framework::Fixture { public: template - void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type, ActivationLayerInfo act_info) + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, + DataType data_type, ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(dilation); @@ -84,7 +85,7 @@ protected: } } - TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, const PadStrideInfo &info, DataType data_type, ActivationLayerInfo act_info) { // Create tensors -- cgit v1.2.1