From 5264b7d5555ec980f9c52c719122479d0d676af8 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Mon, 21 Oct 2019 14:25:41 +0100 Subject: COMPMID-2576: Fuse activation in Winograd output transform. Change-Id: I26dd1307847adeaaefae0a7374b9858c07d71372 Signed-off-by: Pablo Tello Reviewed-on: https://review.mlplatform.org/c/2172 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- .../kernels/NEWinogradConvolutionLayerKernel.h | 126 ++++++++++++--------- .../NEON/kernels/convolution/winograd/winograd.hpp | 105 +++++++++-------- .../convolution/winograd/winograd_layer.hpp | 36 +++--- 3 files changed, 148 insertions(+), 119 deletions(-) (limited to 'arm_compute/core') diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h index f6b189cb1c..962037dd4f 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h @@ -64,13 +64,15 @@ public: /** Gets the stride between matrices in the input workspace * - * @param[in] kernel_shape The shape of the weights tensor. - * @param[in] input_shape The shape of the input tensor. - * @param[in] padding_type The type of padding to be used. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Stride expressed in bytes. */ - virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; /** Configure the output transform kernel. 
* @@ -141,13 +143,20 @@ public: /** Gets the stride between matrices in the input workspace * - * @param[in] kernel_shape The shape of the weights tensor. - * @param[in] input_shape The shape of the input tensor. - * @param[in] padding_type The type of padding to be used. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Stride expressed in bytes. */ - int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; + int get_matrix_stride( + int num_batches, + int num_channels, + int num_rows, + int num_cols, + bool same_padding) const override; /** Default constructor */ NEWinogradLayerTransformInputKernel(); @@ -241,31 +250,35 @@ public: * @param[in] num_rows Number of rows in each feature map of the input tensor. * @param[in] num_cols Number of columns in each feature map of the input tensor. * @param[in] num_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Storage size (in units of TOut) required. */ - virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels, bool same_padding) const = 0; + virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; /** Gets the stride between matrices in the output workspace * - * @param[in] kernel_shape The shape of the weights tensor. - * @param[in] input_shape The shape of the input tensor. - * @param[in] padding_type The type of padding to be used. + * @param[in] num_batches Number of batches in the output tensor. 
+ * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. * * @return Stride expressed in bytes. */ - virtual int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const = 0; + virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; /** Get the output shape of a convolution. * - * @param[in] kernel_shape The shape of the weights tensor. - * @param[in] in_shape The shape of the input tensor. - * @param[in] padding The type of padding to be used. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] padding_same True if padding is SAME, false otherwise * - * @return Stride expressed in bytes. + * @return Shape of the output tensor */ - virtual Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const = 0; + virtual std::pair<unsigned int, unsigned int> get_output_shape( + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + bool padding_same /* True if padding is SAME, false otherwise */ + ) const = 0; /** Configure the output transform kernel. * @@ -278,17 +291,19 @@ public: * @param[in] num_cols Number of columns in output tensor. * @param[in] num_channels Number of feature maps in the output tensor. * @param[in] workspace Tensor to be used as the working space during the computation. 
+ * @param[in] activation Activation to be used */ virtual void configure( - const ITensor *biases, - const ITensor *transformed_output, - const int matrix_stride, - ITensor *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensor *workspace) = 0; + const ITensor *biases, + const ITensor *transformed_output, + const int matrix_stride, + ITensor *output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, + ITensor *workspace, + const arm_gemm::Activation &activation) = 0; virtual ~INEWinogradLayerTransformOutputKernel() { @@ -326,30 +341,33 @@ public: * @param[in] num_rows Number of rows in each feature map of the input tensor. * @param[in] num_cols Number of columns in each feature map of the input tensor. * @param[in] num_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". * * @return Storage size (in units of TOut) required. */ - unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels, bool same_padding) const override; + unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; /** Gets the stride between matrices in the output workspace * - * @param[in] kernel_shape The shape of the weights tensor. - * @param[in] input_shape The shape of the input tensor. - * @param[in] padding_type The type of padding to be used. + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. * * @return Stride expressed in bytes. 
*/ - int get_matrix_stride(const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const override; + int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; /** Get the output shape of a convolution. * - * @param[in] kernel_shape The shape of the weights tensor. - * @param[in] in_shape The shape of the input tensor. - * @param[in] padding The type of padding to be used. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] padding_same True if padding is SAME, false otherwise * - * @return Stride expressed in bytes. + * @return Shape of the output tensor */ - Tensor4DShape get_output_shape(const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const override; + std::pair<unsigned int, unsigned int> get_output_shape( + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + bool padding_same) const override; /** Get the working space required to perform the transformation. * @@ -374,17 +392,19 @@ public: * @param[in] num_cols Number of columns in output tensor. * @param[in] num_channels Number of feature maps in the output tensor. * @param[in] workspace Tensor to be used as the working space during the computation. 
+ * @param[in] activation Activation to be used */ void configure( - const ITensor *biases, - const ITensor *transformed_output, - const int matrix_stride, - ITensor *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensor *workspace) override; + const ITensor *biases, + const ITensor *transformed_output, + const int matrix_stride, + ITensor *output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, + ITensor *workspace, + const arm_gemm::Activation &activation) override; void run(const Window &window, const ThreadInfo &info) override; @@ -448,11 +468,12 @@ public: virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0; /** Gets the stride between matrices in the kernel workspace * - * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. * * @return Stride expressed in bytes. */ - virtual int get_matrix_stride(const KernelShape &kernel_shape) const = 0; + virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0; /** Configure the weights transform kernel. * @@ -535,11 +556,12 @@ public: /** Gets the stride between matrices in the input workspace * - * @param[in] kernel_shape The shape of the weights tensor. + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. * * @return Stride expressed in bytes. 
*/ - int get_matrix_stride(const KernelShape &kernel_shape) const override; + int get_matrix_stride(int num_output_channels, int num_input_channels) const override; void run(const Window &window, const ThreadInfo &info) override; bool is_parallelisable() const override; diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp index 183c9c1061..bc0d9d4296 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,10 @@ #pragma once -#include "convolution.hpp" -#include "tensor.hpp" -#include "utils.hpp" +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" + +#include +#include namespace winograd { @@ -308,7 +309,8 @@ class OutputTransform : public IOutputTransform int n_batches, /**< Number of batches in output tensor. */ int n_rows, /**< Number of rows in output tensor. */ int n_cols, /**< Number of columns in output tensor. */ - int n_channels /**< Number of channels in output tensor. */ + int n_channels, /**< Number of channels in output tensor. */ + const arm_gemm::Activation &activation ); OutputTransform(OutputTransform&) = delete; @@ -344,6 +346,7 @@ class OutputTransform : public IOutputTransform static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1; const int _n_batches, _n_rows, _n_cols, _n_channels; + const TOut _output_min, _output_max; private: void transform_uncropped_tile( @@ -372,7 +375,9 @@ class OutputTransform : public IOutputTransform const TOut* biases, TOut* output, int output_row_stride, - int output_col_stride + int output_col_stride, + TOut output_min, + TOut output_max ); /** Get the working space for a thread. 
*/ @@ -405,7 +410,8 @@ class OutputTransform : int n_batches, /**< Number of batches in output tensor. */ int n_rows, /**< Number of rows in output tensor. */ int n_cols, /**< Number of columns in output tensor. */ - int n_channels /**< Number of channels in output tensor. */ + int n_channels, /**< Number of channels in output tensor. */ + const arm_gemm::Activation &activation ); /** Set pointers to the output tensor written by the transform. */ @@ -528,79 +534,84 @@ class WinogradGEMM typedef TIn InputType; /** Get the output shape of a convolution. */ - static Tensor4DShape get_output_shape( - const KernelShape &kernel_shape, - const Tensor4DShape &in_shape, - const PaddingType padding - ); - - /* Get the memory required to transform the kernel. - */ - static size_t get_kernel_transform_working_size(const KernelShape &shape); + static std::pair<unsigned int, unsigned int> get_output_shape( + const std::pair<unsigned int, unsigned int> input_shape, + bool padding_same); /** Get the memory required to store the kernel transformed into the * Winograd domain. */ - static size_t get_kernel_storage_size(const KernelShape &shape); + static size_t get_kernel_storage_size(unsigned int n_input_channels, + unsigned int n_output_channels); /** Get the memory required to store the input tensor transformed into * the Winograd domain. */ static size_t get_input_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_channels, // Number of input channels + bool padding_same); /** Get the memory required to store the output tensor in the Winograd * domain. 
*/ static size_t get_output_storage_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of output rows + unsigned int n_cols, // Number of output columns + unsigned int n_channels // Number of output channels + ); /** Get the memory required to apply a Winograd operator to some input. */ static size_t get_working_space_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_input_channels, // Number of input channels + unsigned int n_output_channels, // Number of output channels + bool padding_same); /* Get the memory required by a single "input" matrix. */ static size_t get_input_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_channels, // Number of input channels + bool padding_same); static int get_input_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of input rows + unsigned int n_cols, // Number of input columns + unsigned int n_channels, // Number of input channels + bool padding_same); /* Get the memory required by a single "output" matrix. 
*/ static size_t get_output_matrix_size( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of output rows + unsigned int n_cols, // Number of output columns + unsigned int n_channels // Number of output channels + ); static int get_output_matrix_stride( - const KernelShape &kernel_shape, - const Tensor4DShape &input_shape, - const PaddingType padding_type - ); + unsigned int n_batches, // Number of batches + unsigned int n_rows, // Number of output rows + unsigned int n_cols, // Number of output columns + unsigned int n_channels // Number of output channels + ); /* Get the memory required by a single "kernel" matrix. */ - static size_t get_kernel_matrix_size(const KernelShape &shape); - static int get_kernel_matrix_stride(const KernelShape &shape); + static size_t get_kernel_matrix_size(unsigned int n_input_channels, + unsigned int n_output_channels); + static int get_kernel_matrix_stride(unsigned int n_input_channels, + unsigned int n_output_channels); static constexpr int M_BLOCK = 4; /** Size of block used by GEMM. */ static constexpr int N_BLOCK = 16; /** Size of block used by GEMM. */ diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp index 9d418bebb4..ed8fede385 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,9 +23,6 @@ */ #pragma once - -#include - #include "arm_gemm_local.hpp" #include "arm_gemm.hpp" #include "winograd.hpp" @@ -42,8 +39,8 @@ class IWinogradConvolutionLayer virtual unsigned int weight_transform_get_window(void) const = 0; virtual void weight_transform_run(unsigned int start, unsigned int stop) = 0; - virtual ITransform& input_transform(void) = 0; // Expose the input transform - virtual ITransform& output_transform(void) = 0; // Expose the output transform + virtual IInputTransform& input_transform(void) = 0; // Expose the input transform + virtual IOutputTransform& output_transform(void) = 0; // Expose the output transform virtual arm_gemm::IGemmCommon *gemm(void) = 0; // Expose the underlying GEMM }; @@ -65,15 +62,18 @@ template class WinogradConvolutionLayer : public IWinogradConvolutionLayer { + public: + using WinogradBase = winograd::WinogradGEMM; + using WeightsTransform = typename WinogradBase::template WeightsTransform; + using InputTransform = typename WinogradBase::template InputTransform; + using WinogradConv = typename WinogradBase::template Convolution; + using OutputTransform = typename WinogradBase::template OutputTransform; + private: static constexpr int InnerTileRows = OutputTileRows + KernelRows - 1; static constexpr int InnerTileCols = OutputTileCols + KernelCols - 1; static constexpr int N_GEMMS = InnerTileRows * InnerTileCols; - const KernelShape _kernel_shape; - const Tensor4DShape _input_shape; - const PaddingType _padding; - const Tensor4DShape _output_shape; const int _n_output_rows, _n_output_cols; const int _kernel_matrix_stride, _kernel_matrix_row_stride; const int _input_matrix_stride, _input_matrix_row_stride; @@ -81,19 +81,14 @@ class WinogradConvolutionLayer : public IWinogradConvolutionLayer const int _tile_rows, _tile_cols; const int _m, _k, _n; - public: - using WinogradBase = winograd::WinogradGEMM; - using WeightsTransform = typename WinogradBase::template WeightsTransform; 
- using InputTransform = typename WinogradBase::template InputTransform; - using WinogradConv = typename WinogradBase::template Convolution; - using OutputTransform = typename WinogradBase::template OutputTransform; - - /* Public member variables. */ WeightsTransform weights_transform; /** Operator to transform weights to Winograd domain. */ InputTransform _input_transform; /** Operator to transform input to Winograd domain. */ + const arm_gemm::GemmArgs gemm_args; arm_gemm::UniqueGemmCommon gemms; /** Operator to perform multiple GEMMs. */ OutputTransform _output_transform; /** Operator to transform output from Winograd domain. */ + public: + /** Determine how much memory (in units of TIn) to allocate for the * transformed weights. */ @@ -186,6 +181,7 @@ class WinogradConvolutionLayer : public IWinogradConvolutionLayer const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ const int n_output_channels, /** Number of feature maps in the output tensor. */ const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ + const arm_gemm::Activation &activation, const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ TInGEMM* const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ @@ -201,8 +197,8 @@ class WinogradConvolutionLayer : public IWinogradConvolutionLayer unsigned int weight_transform_get_window(void) const; void weight_transform_run(const unsigned int start, const unsigned int stop); - ITransform& input_transform(void); - ITransform& output_transform(void); + IInputTransform& input_transform(void); + IOutputTransform& output_transform(void); /* Get a pointer to the GEMM underlying the Winograd transform. 
*/ arm_gemm::IGemmCommon *gemm(void); -- cgit v1.2.1