diff options
author:    Pablo Tello <pablo.tello@arm.com>  2018-01-23 09:36:04 +0000
committer: Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:45:00 +0000
commit:    d6ca478a7e410f8f529c2e505305b46d9fe21a9b (patch)
tree:      5c50c06e07f812890f127b1c4933996987f74f17 /arm_compute
parent:    d05dce46a14a7b67f322328ecd95bf96bdd30bae (diff)
download:  ComputeLibrary-d6ca478a7e410f8f529c2e505305b46d9fe21a9b.tar.gz
COMPMID-784: Added support for biases in WinogradLayer.
1) Updated to the latest code from the RSH repo.
2) Moved winograd transforms into kernels.
3) Added support for biases
Change-Id: I7f39f34a599b49d7d9b549cc10a4f4d4a8007ab8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117474
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute')
7 files changed, 194 insertions, 93 deletions
diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h index b85f372896..ea6c8d813d 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h @@ -32,6 +32,8 @@ namespace arm_compute { class ITensor; class NEWinogradLayerKernel; +class NEWinogradLayerTransformInputKernel; +class NEWinogradLayerTransformWeightsKernel; class Winograd3x3F32 final { @@ -48,10 +50,15 @@ public: * @param[out] weights_storage Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size * @param[in] input Pointer to NHWC ordered input tensor, in the spatial domain. * @param[out] winograd_input Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. + * @param[in] biases Pointer to the biases vector. * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. * @param[out] winograd_output Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. 
*/ friend class NEWinogradLayerKernel; + friend class NEWinogradLayerTransformInputKernel; + friend class NEWinogradLayerTransformOutputKernel; + friend class NEWinogradLayerTransformWeightsKernel; + Winograd3x3F32( const int n_batches, const int n_input_channels, @@ -67,16 +74,124 @@ public: float *const winograd_output); ~Winograd3x3F32(); - void transform_weights(); - void transform_input(); - void transform_output(); private: class Private; std::unique_ptr<Private> _pimpl; }; -class NEWinogradLayerKernel : public INEKernel +class INEWinogradLayerTransformKernel : public INEKernel +{ +public: + /** Constructor */ + INEWinogradLayerTransformKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INEWinogradLayerTransformKernel(const INEWinogradLayerTransformKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INEWinogradLayerTransformKernel &operator=(const INEWinogradLayerTransformKernel &) = delete; + /** Allow instances of this class to be moved */ + INEWinogradLayerTransformKernel(INEWinogradLayerTransformKernel &&) = default; + /** Allow instances of this class to be moved */ + INEWinogradLayerTransformKernel &operator=(INEWinogradLayerTransformKernel &&) = default; + + virtual ~INEWinogradLayerTransformKernel() = default; + + /** Initialise the kernel + * + * @param[in] convolver A pointer to the winograd convolver, this object must have been configured and is ready to execute 16 GEMMS . 
+ */ + virtual void configure(Winograd3x3F32 *convolver); + +protected: + Winograd3x3F32 *_convolver; +}; + +class NEWinogradLayerTransformInputKernel final : public INEWinogradLayerTransformKernel +{ +public: + const char *name() const override + { + return "NEWinogradLayerTransformInputKernel"; + } + // Inherited methods overridden: + void configure(Winograd3x3F32 *convolver) override; + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; +}; + +class NEWinogradLayerTransformOutputKernel final : public INEKernel +{ +public: + const char *name() const override + { + return "NEWinogradLayerTransformOutputKernel"; + } + /** Constructor */ + NEWinogradLayerTransformOutputKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerTransformOutputKernel(const NEWinogradLayerTransformOutputKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerTransformOutputKernel &operator=(const NEWinogradLayerTransformOutputKernel &) = delete; + /** Allow instances of this class to be moved */ + NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default; + /** Allow instances of this class to be moved */ + NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default; + + ~NEWinogradLayerTransformOutputKernel() = default; + + /** Configure the output transform kernel. + * + * @param[in] biases Pointer to the biases tensor. + * @param[in] output_workingspace Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride() + * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. 
+ * @param[in] n_batches Number of batches in the input tensor. + * @param[in] n_rows Number of rows in output tensor. + * @param[in] n_cols Number of columns in output tensor. + * @param[in] n_channels Number of feature maps in the output tensor. + */ + void configure( + const ITensor *biases, + const float *const output_workingspace, + const int matrix_stride, + float *const output, + const int n_batches, + const int n_rows, + const int n_cols, + const int n_channels); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; + +private: + const ITensor *_biases; + const float *_output_workspace; + int _matrix_stride; + int _matrix_row_stride; + float *_output; + int _n_batches; + int _n_rows; + int _n_cols; + int _n_channels; +}; + +class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformKernel +{ +public: + const char *name() const override + { + return "NEWinogradLayerTransformWeightsKernel"; + } + // Inherited methods overridden: + void configure(Winograd3x3F32 *convolver) override; + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; +}; + +class NEWinogradLayerKernel final : public INEKernel { public: const char *name() const override @@ -95,7 +210,7 @@ public: /** Allow instances of this class to be moved */ NEWinogradLayerKernel &operator=(NEWinogradLayerKernel &&) = default; - virtual ~NEWinogradLayerKernel() = default; + ~NEWinogradLayerKernel() = default; /** Initialise the kernel * diff --git a/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp b/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp index 725f6cab65..6a9984a24a 100644 --- a/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp +++ b/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp @@ -29,6 +29,7 @@ void direct_convolution( const Tensor4D<Tensor4DShape, float>& input, const 
Tensor4D<KernelShape, float>& kernel, + const Tensor4D<Tensor4DShape, float>& biases, Tensor4D<Tensor4DShape, float>& output, const PaddingType padding ); diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp index 39b444184e..075765a513 100644 --- a/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp +++ b/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp @@ -71,7 +71,7 @@ namespace winograd const int row_offset = (tile_i == 0) ? 0 : ((padding_type == PADDING_VALID) ? 0 : 1); const T* const input_base_row = ( - input_base_batch + ((inner_tile_rows - 2)*tile_i - row_offset)*input_row_stride + input_base_batch + ((inner_tile_rows - (kernel_rows - 1))*tile_i - row_offset)*input_row_stride ); T* const outptr_base_row = outptr_base_batch + tile_i*output_row_stride; diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp index 7fa5ee9617..0dd719751b 100644 --- a/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp +++ b/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp @@ -35,6 +35,7 @@ namespace winograd const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output ) { @@ -69,8 +70,9 @@ namespace winograd // Process the row process_tile_row( tile_N, output_shape.n_channels, matrix_tile_row, matrix_stride, - matrix_row_stride, outptr_row, output_row_stride, - output_col_stride, row_pad_bottom, pad_right + matrix_row_stride, biases, + outptr_row, output_row_stride, output_col_stride, row_pad_bottom, + pad_right ); } } @@ -85,6 +87,7 @@ namespace winograd const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output, const int output_row_stride, const int output_col_stride, @@ -102,7 +105,7 @@ namespace winograd // Perform the output transformation 
tile_fns[row_pad_bottom][tile_pad_right]( - n_channels, matrix_row, matrix_stride, + n_channels, matrix_row, matrix_stride, biases, outptr, output_row_stride, output_col_stride ); } @@ -131,14 +134,17 @@ namespace winograd const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output, const int n_batches, const int n_rows, const int n_cols, const int n_channels - ) : _matrix_base(matrix_base), _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), - _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels), - _tile_M(iceildiv(n_rows, output_tile_rows)), _tile_N(iceildiv(n_cols, output_tile_cols)) + ) : _matrix_base(matrix_base), _biases(biases), + _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), + _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), + _n_channels(n_channels), _tile_M(iceildiv(n_rows, output_tile_rows)), + _tile_N(iceildiv(n_cols, output_tile_cols)) { } @@ -168,7 +174,8 @@ namespace winograd _n_batches, _n_rows, _n_cols, _n_channels, NHWC }; execute( - output_shape, _matrix_base, _matrix_stride, _matrix_row_stride, _outptr + output_shape, _matrix_base, _matrix_stride, _matrix_row_stride, _biases, + _outptr ); } } // namespace winograd diff --git a/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp b/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp index adca48a6d6..2ea70f182b 100644 --- a/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp +++ b/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp @@ -183,7 +183,7 @@ class WinogradGEMM const int row_pad_top, const int row_pad_left, const int row_pad_bottom, - const int row_pad_right + const int n_cols ); static constexpr int max_pad_bottom = inner_tile_rows - 1; @@ -225,6 +225,7 @@ class WinogradGEMM const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output 
); @@ -236,6 +237,7 @@ class WinogradGEMM const T* const matrix_base, /** Pointer to base of matrices. */ const int matrix_stride, /** Stride between matrices. */ const int matrix_row_stride, /** Stride within a matrix. */ + const T* const biases, /** Pointer to biases vector. */ T* const output, /** Pointer to output tensor. */ const int n_batches, /** Number of batches in output tensor. */ const int n_rows, /** Number of rows in output tensor. */ @@ -257,6 +259,7 @@ class WinogradGEMM const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output, const int output_row_stride, const int output_col_stride, @@ -270,14 +273,15 @@ class WinogradGEMM /** Prepare a single tile of the output tensor. */ template <int pad_bottom, int pad_right> - static void process_tile(int, const T*, int, T*, int, int); + static void process_tile(int, const T*, int, const T*, T*, int, int); // Array of methods to produce tiles of output tensor. - typedef void (*TileFn)(int, const T*, int, T*, int, int); + typedef void (*TileFn)(int, const T*, int, const T*, T*, int, int); static const TileFn tile_fns[max_pad_bottom][max_pad_right]; /** Member constants for instances of the transform. 
*/ const T* const _matrix_base; + const T* const _biases; const int _matrix_stride, _matrix_row_stride; T* const _outptr; const int _n_batches, _n_rows, _n_cols, _n_channels, _tile_M, _tile_N; @@ -328,6 +332,7 @@ class WinogradGEMM void execute( TOut* const output, const TIn* const input, + const TOut* const biases, void* working_space=NULL, const int n_threads=1 ); @@ -336,6 +341,7 @@ class WinogradGEMM void execute( TOut* const output, const TIn* const input, + const TOut* const biases, const int n_threads ); diff --git a/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp index 4559312df4..1db63d750b 100644 --- a/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp +++ b/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp @@ -74,87 +74,56 @@ class WinogradConvolutionLayer /** Determine how much memory (in units of TIn) to allocate for the * transformed weights. - * - * @param[in] n_output_channels Number of output feature maps. - * @param[in] n_input_channels Number of input feature maps. */ static unsigned int get_weight_storage_size( - const int n_output_channels, - const int n_input_channels + const int n_output_channels, /** Number of output feature maps. */ + const int n_input_channels /** Number of input feature maps. */ ); /** Determine how much memory (in units of TIn) to allocate for the * transformed input. - * - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_channels Number of feature maps in the input tensor. - * @param[in] n_rows Number of rows in each feature map. - * @param[in] n_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ static unsigned int get_input_storage_size( - const int n_batches, - const int n_channels, - const int n_rows, - const int n_cols, - const bool same_padding + const int n_batches, /** Number of batches in the input tensor. 
*/ + const int n_channels, /** Number of feature maps in the input tensor. */ + const int n_rows, /** Number of rows in each feature map. */ + const int n_cols, /** Number of columns in each feature map. */ + const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ ); /** Determine how much memory (in units of TOut) to allocate for the * (Winograd domain) output. - * - * @param[in] n_batches Number of batches in the output tensor. - * @param[in] n_rows Number of rows in each feature map of the input tensor. - * @param[in] n_cols Number of columns in each feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ static unsigned int get_output_storage_size( - const int n_batches, - const int n_rows, - const int n_cols, - const int n_output_channels, - const bool same_padding + const int n_batches, /** Number of batches in the output tensor. */ + const int n_rows, /** Number of rows in each feature map of the input tensor. */ + const int n_cols, /** Number of columns in each feature map of the input tensor. */ + const int n_output_channels, /** Number of feature maps in the output tensor. */ + const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ ); - /** Get the shape (rows, cols) of a feature map of the output tensor. - * - * @param[in] n_input_rows Number of rows in the input feature map. - * @param[in] n_input_cols Number of columns in the input feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - */ + /** Get the shape (rows, cols) of a feature map of the output tensor. */ static std::pair<int, int> get_output_feature_map_shape( - const int n_input_rows, - const int n_input_cols, - const bool same_padding + const int n_input_rows, /** Number of rows in the input feature map. */ + const int n_input_cols, /** Number of columns in the input feature map. 
*/ + const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ ); /** Create a new Winograd convolution layer. - * @param[in] n_batches Number of batches in the input and output tensors. - * @param[in] n_input_channels Number of feature maps in a batch of the input tensor. - * @param[in] n_input_rows Number of rows in a feature map of the input tensor. - * @param[in] n_input_cols Number of columns in a feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * @param[in] weights Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. - * @param[out] weights_storage Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size - * @param[in] input Pointer to NHWC ordered input tensor, in the spatial domain. - * @param[out] winograd_input Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. - * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. - * @param[out] winograd_output Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ WinogradConvolutionLayer( - const int n_batches, - const int n_input_channels, - const int n_input_rows, - const int n_input_cols, - const int n_output_channels, - const bool same_padding, - const TIn* const weights, - TIn* const weights_storage, - const TIn* const input, - TIn* const winograd_input, - TOut* const output, - TOut* const winograd_output + const int n_batches, /** Number of batches in the input and output tensors. */ + const int n_input_channels, /** Number of feature maps in a batch of the input tensor. 
*/ + const int n_input_rows, /** Number of rows in a feature map of the input tensor. */ + const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ + const int n_output_channels, /** Number of feature maps in the output tensor. */ + const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ + const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */ + TIn* const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ + const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ + TIn* const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */ + const TOut* const biases, /** Pointer to biases vector. */ + TOut* const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ + TOut* const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ ); }; diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h index 60cdc97469..1682495f0d 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h @@ -52,7 +52,7 @@ public: * Data types supported: F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. * Currently only 3x3 kernels are supported. - * @param[in] biases Not supported, biases will be ignored. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. 
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. @@ -68,22 +68,25 @@ public: NEWinogradLayer &operator=(const NEWinogradLayer &) = delete; private: - MemoryGroup _memory_group; - NEWinogradLayerKernel _winograd_kernel; - CPPPermute _permute_input; - CPPPermute _permute_weights; - CPPPermute _permute_output; - Tensor _input_workspace; - Tensor _output_workspace; - Tensor _kernel_storage; - Tensor _input_nhwc; - Tensor _output_nhwc; - Tensor _weights_hwio; - const ITensor *_input; - const ITensor *_weights; - ITensor *_output; - bool _reshaped_kernel; - std::unique_ptr<Winograd3x3F32> _conv; + MemoryGroup _memory_group; + NEWinogradLayerKernel _winograd_kernel; + NEWinogradLayerTransformInputKernel _transform_input_kernel; + NEWinogradLayerTransformOutputKernel _transform_output_kernel; + NEWinogradLayerTransformWeightsKernel _transform_weights_kernel; + CPPPermute _permute_input; + CPPPermute _permute_weights; + CPPPermute _permute_output; + Tensor _input_workspace; + Tensor _output_workspace; + Tensor _kernel_storage; + Tensor _input_nhwc; + Tensor _output_nhwc; + Tensor _weights_hwio; + const ITensor *_input; + const ITensor *_weights; + ITensor *_output; + bool _reshaped_kernel; + std::unique_ptr<Winograd3x3F32> _conv; }; } #endif /* __ARM_COMPUTE_NEWINOGRADLAYER_H__ */ |