From e1553374d037dbf84999258d5bc88927891770cc Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Mon, 16 Jul 2018 18:53:52 +0100 Subject: COMPMID-1357: Stop passing around raw pointers in NEWinogradConvolution First step to allow us to enable the memory manager in this function Change-Id: Ic42fdac4c74cd21973c71130b59883e4a87d3dca Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140167 Reviewed-by: Pablo Tello Reviewed-by: Vidhya Sudhan Loganathan Tested-by: Jenkins --- .../kernels/NEWinogradConvolutionLayerKernel.h | 163 +++------------------ .../NEON/functions/NEWinogradConvolutionLayer.h | 2 +- docs/00_introduction.dox | 2 +- .../kernels/NEWinogradConvolutionLayerKernel.cpp | 125 ++-------------- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 84 ++++++----- 5 files changed, 81 insertions(+), 295 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h index 68c133ee37..9cdd69a70a 100644 --- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h @@ -74,7 +74,7 @@ public: * @param[in] matrix_stride Stride between output matrices. */ virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, - const PaddingType padding, T *const output, const int matrix_stride) = 0; + const PaddingType padding, ITensor *output, const int matrix_stride) = 0; /** Destructor */ virtual ~INEWinogradLayerTransformInputKernel() @@ -152,7 +152,7 @@ public: const int num_cols, const int num_channels, const PaddingType padding, - T *const output, + ITensor *output, const int matrix_stride) override; // Inherited methods overridden: @@ -181,7 +181,7 @@ private: int _num_cols; /**< Number of columns in input tensor. */ int _num_channels; /**< Number of channels in input tensor. */ PaddingType _padding; /**< Padding type. */ - T *_output; /**< Base of output matrices. */ + ITensor *_output; /**< Base of output matrices. */ int _matrix_stride; /**< Stride between output matrices. */ }; @@ -236,9 +236,9 @@ public: */ virtual void configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -318,9 +318,9 @@ public: */ void configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -345,7 +345,7 @@ private: using OutputTransform = typename WinogradBase::template OutputTransform; const ITensor *_biases; - const T *_output_workspace; + const ITensor *_output_workspace; int _matrix_stride; int _matrix_row_stride; ITensor *_output_nhwc; @@ -379,14 +379,14 @@ public: /** Configure the weights transform kernel. * - * @param[in] weights_hwio Pointer to the weights tensor - * @param[in] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] num_output_channels Number of filters. - * @param[in] num_input_channels Number of channels in each filter. 
+ * @param[in] weights_hwio Pointer to the weights tensor + * @param[out] output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Stride across matrices in the output workspace. + * @param[in] num_output_channels Number of filters. + * @param[in] num_input_channels Number of channels in each filter. */ - virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; + virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; virtual ~INEWinogradLayerTransformWeightsKernel() { @@ -428,7 +428,7 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); // Inherited methods overridden: - void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; + void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; int get_matrix_stride(const KernelShape &kernel_shape) const override; void run(const Window &window, const ThreadInfo &info) override; @@ -440,147 +440,26 @@ private: using WeightsTransform = typename WinogradBase::template WeightsTransform; const ITensor *_weights_hwio; - T *_output; + ITensor *_output; int _matrix_stride; int _num_output_channels; int _num_input_channels; }; -/** Interface for the NEON kernel to perform Winograd. */ -template -class INEWinogradLayerBatchedGEMMKernel : public INEKernel -{ -public: - /** Get the number of GEMMs to compute - */ - virtual unsigned int get_number_gemms() const = 0; - /** Initialise the kernel - * - * @param[in] n_gemms Number of GEMMs to compute. - * @param[in] M in_shape.num_batches * tile_rows * tile_cols. - * @param[in] K Number of channels in the input tensor. - * @param[in] N Number of channels in the output tensor. - * @param[in] a_matrix_stride Stride between input matrices. - * @param[in] a_row_stride Row stride inside input matrix. - * @param[in] b_matrix_stride Stride between weights matrices. - * @param[in] b_row_stride Row stride inside the weights matrix. - * @param[in] c_matrix_stride Stride between output matrices. - * @param[in] c_row_stride Row stride inside the output matrix. - * @param[out] a_ptr Input workspace. - * @param[out] b_ptr Kernel workspace. - * @param[out] c_ptr Output workspace. - */ - virtual void configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) = 0; - - /** Get the number of tiles per row - */ - virtual int get_output_tile_rows() const = 0; - /** Get the number of tiles per columns - */ - virtual int get_output_tile_cols() const = 0; - /** Get the number of blocks - */ - virtual int get_number_blocks() const = 0; -}; - /** NEON kernel to perform Winograd. 
*/ template -class NEWinogradLayerBatchedGEMMKernel : public INEWinogradLayerBatchedGEMMKernel +class NEWinogradLayerConfiguration { public: /** Winograd base kernel */ using WinogradBase = winograd::WinogradGEMM; /** Winograd convolution kernel */ - using WinogradConv = typename WinogradBase::template Convolution; - /** Winograd batched blocked GEMM operator */ - using MultiGEMM = winograd::BatchedBlockedGemm; - - const char *name() const override - { - return "NEWinogradLayerBatchedGEMMKernel"; - } - /** Constructor */ - NEWinogradLayerBatchedGEMMKernel(); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerBatchedGEMMKernel(const NEWinogradLayerBatchedGEMMKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradLayerBatchedGEMMKernel &operator=(const NEWinogradLayerBatchedGEMMKernel &) = delete; - /** Allow instances of this class to be moved */ - NEWinogradLayerBatchedGEMMKernel(NEWinogradLayerBatchedGEMMKernel &&) = default; - /** Allow instances of this class to be moved */ - NEWinogradLayerBatchedGEMMKernel &operator=(NEWinogradLayerBatchedGEMMKernel &&) = default; - /** Default destructor. */ - ~NEWinogradLayerBatchedGEMMKernel() = default; - - // Inherited methods overridden: - - unsigned int get_number_gemms() const override; - int get_output_tile_rows() const override; - int get_output_tile_cols() const override; - int get_number_blocks() const override; - - /** Initialise the kernel - * - * @param[in] n_gemms Number of GEMMs to compute. - * @param[in] M in_shape.num_batches * tile_rows * tile_cols. - * @param[in] K Number of channels in the input tensor. - * @param[in] N Number of channels in the output tensor. - * @param[in] a_matrix_stride Stride between input matrices. - * @param[in] a_row_stride Row stride inside input matrix. - * @param[in] b_matrix_stride Stride between weights matrices. - * @param[in] b_row_stride Row stride inside the weights matrix. - * @param[in] c_matrix_stride Stride between output matrices. - * @param[in] c_row_stride Row stride inside the output matrix. - * @param[out] a_ptr Input workspace. - * @param[out] b_ptr Kernel workspace. - * @param[out] c_ptr Output workspace. - */ - void configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) override; - void run(const Window &window, const ThreadInfo &info) override; - - /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerBatchedGEMMKernel. - * - * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32 - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a. - * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. - * @param[out] output Output tensor. 
Data type supported: same as @p a - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of matrix C - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run - * - * @return a status - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo()); + using WinogradConv = typename WinogradBase::template Convolution; -private: - static const int _output_tile_rows = OutputTileRows; - static const int _output_tile_cols = OutputTileCols; - std::unique_ptr _gemms; + using TransformInputKernel = NEWinogradLayerTransformInputKernel; + using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel; + using TransformOutputKernel = NEWinogradLayerTransformOutputKernel; }; } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index d897ae00e7..384fbf893b 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -43,7 +43,7 @@ class ITensor; * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) * -# @ref NEWinogradLayerTransformInputKernel * -# @ref NEWinogradLayerTransformOutputKernel - * -# @ref NEWinogradLayerBatchedGEMMKernel + * -# @ref NEGEMMAssemblyDispatchF32 * -# @ref CPPPermute (three times: weights, input and output) * * @note Some Winograd configurations (i.e. F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 7f693c901c..4afd6d85f4 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -315,7 +315,7 @@ v18.02 Public major release - @ref NEWinogradLayerTransformInputKernel / NEWinogradLayer - @ref NEWinogradLayerTransformOutputKernel / NEWinogradLayer - @ref NEWinogradLayerTransformWeightsKernel / NEWinogradLayer - - Renamed NEWinogradLayerKernel into @ref NEWinogradLayerBatchedGEMMKernel + - Renamed NEWinogradLayerKernel into NEWinogradLayerBatchedGEMMKernel - New GLES kernels / functions: - @ref GCTensorShiftKernel / @ref GCTensorShift diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp index 50e69a8adf..b295a0c685 100644 --- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp @@ -40,38 +40,6 @@ namespace arm_compute namespace { -Status validate_arguments_winograd_gemm(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta, - const GEMMInfo &gemm_info = GEMMInfo()) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(b); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - if(c != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info()); 
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B"); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != a->num_dimensions()); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_UNUSED(alpha, beta); - return Status{}; -} - Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); @@ -232,78 +200,6 @@ std::pair validate_and_configure_window_winograd_output_trans(IT return std::make_pair(err, win); } } // namespace -template -NEWinogradLayerBatchedGEMMKernel::NEWinogradLayerBatchedGEMMKernel() - : _gemms() -{ -} - -template -void NEWinogradLayerBatchedGEMMKernel::configure( - const unsigned int n_gemms, - const int M, const int K, const int N, - const int a_matrix_stride, - const int a_row_stride, - const int b_matrix_stride, - const int b_row_stride, - const int c_matrix_stride, - const int c_row_stride, - const TIn *const a_ptr, - const TIn *const b_ptr, - TOut *const c_ptr) -{ - _gemms = support::cpp14::make_unique(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr); - Window win; - auto win_last = _gemms->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - INEKernel::configure(win); -} - -template -void NEWinogradLayerBatchedGEMMKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - const size_t first_gemm = window.x().start(); - const size_t last_gemm = window.x().end(); - _gemms->run(first_gemm, last_gemm); -} - -template -unsigned int NEWinogradLayerBatchedGEMMKernel::get_number_gemms() const -{ - return WinogradBase::N_GEMMS; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_output_tile_rows() const -{ - return _output_tile_rows; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_output_tile_cols() const -{ - return _output_tile_cols; -} - -template -int NEWinogradLayerBatchedGEMMKernel::get_number_blocks() const -{ - return WinogradConv::N_BLOCK; -} - -template -Status NEWinogradLayerBatchedGEMMKernel::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, - const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_gemm(a, b, c, output, alpha, beta, gemm_info)); - return Status{}; -} - -template class NEWinogradLayerBatchedGEMMKernel; -template class NEWinogradLayerBatchedGEMMKernel; -template class NEWinogradLayerBatchedGEMMKernel; // Weights transform @@ -332,7 +228,7 @@ int NEWinogradLayerTransformWeightsKernel void NEWinogradLayerTransformWeightsKernel::configure( const 
ITensor *weights_hwio, - T *const output, + ITensor *output, const int matrix_stride, /** Stride across matrices in the output. */ const int num_output_channels, /** Number of filters. */ const int num_input_channels) /** Number of channels in each filter. */ @@ -344,7 +240,7 @@ void NEWinogradLayerTransformWeightsKernel(_weights_hwio->buffer()), _output, _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); + WeightsTransform transform(reinterpret_cast(_weights_hwio->buffer()), reinterpret_cast(_output->buffer()), _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels); const size_t fst = window.x().start(); const size_t lst = window.x().end(); transform.run(fst, lst); @@ -423,7 +319,7 @@ void NEWinogradLayerTransformInputKernel(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, _output, _matrix_stride, _num_channels); + InputTransform input_transform(reinterpret_cast(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, reinterpret_cast(_output->buffer()), + _matrix_stride, _num_channels); // The code below cannot be moved to configure because biases hasn't been allocated at that point const size_t fst = window.x().start(); @@ -511,9 +408,9 @@ Tensor4DShape NEWinogradLayerTransformOutputKernel void NEWinogradLayerTransformOutputKernel::configure( const ITensor *biases, - const T *const output_workingspace, + const ITensor *output_workingspace, const int matrix_stride, - ITensor *const output_nhwc, + ITensor *output_nhwc, const int num_batches, const int num_rows, const int num_cols, @@ -529,7 +426,7 @@ void NEWinogradLayerTransformOutputKernel(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride, (_biases ? reinterpret_cast(_biases->buffer()) : nullptr), reinterpret_cast(_output_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T)); diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 29da0803a3..a71eade9a1 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -155,29 +155,32 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * { if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } else { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = 
support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; } break; } case 5: { - transform_input_kernel = support::cpp14::make_unique>(); - transform_weights_kernel = support::cpp14::make_unique>(); - transform_output_kernel = support::cpp14::make_unique>(); - n_gemms = NEWinogradLayerBatchedGEMMKernel::WinogradBase::N_GEMMS; - N_BLOCK = NEWinogradLayerBatchedGEMMKernel::WinogradConv::N_BLOCK; + using config = NEWinogradLayerConfiguration; + transform_input_kernel = support::cpp14::make_unique(); + transform_weights_kernel = support::cpp14::make_unique(); + transform_output_kernel = support::cpp14::make_unique(); + n_gemms = config::WinogradBase::N_GEMMS; + N_BLOCK = config::WinogradConv::N_BLOCK; break; } default: @@ -195,21 +198,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int out_channels = output->info()->dimension(channel_idx); const Tensor4DShape in_shape(internal_get_input_shape(input)); + const DataType data_type = input->info()->data_type(); const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. constexpr size_t storage_alignment = 64; // Kernel Storage const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + in_channels) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ // Output storage const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, - use_same_padding) * data_type_size + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + use_same_padding) + * data_type_size + + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ ; const KernelShape kernel_shape({ out_channels, static_cast(kernel_size.height), static_cast(kernel_size.width), in_channels }); const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); @@ -229,28 +239,28 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const int output_matrix_row_stride = kernel_matrix_row_stride; TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(element_size_from_data_type(DataType::F32)); + Strides a_strides(data_type_size); a_strides.set(1, a_strides[0] * k); + //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. 
a_strides.set(2, 0); - //a_strides.set(2, element_size_from_data_type(DataType::F32) * input_matrix_stride / n_gemms); - a_strides.set(3, element_size_from_data_type(DataType::F32) * input_matrix_stride); + a_strides.set(3, data_type_size * input_matrix_stride); TensorShape b_shape(n, k, n_gemms); - Strides b_strides(element_size_from_data_type(DataType::F32)); - b_strides.set(1, element_size_from_data_type(DataType::F32) * kernel_matrix_row_stride); - b_strides.set(2, element_size_from_data_type(DataType::F32) * kernel_matrix_stride); + Strides b_strides(data_type_size); + b_strides.set(1, data_type_size * kernel_matrix_row_stride); + b_strides.set(2, data_type_size * kernel_matrix_stride); TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(element_size_from_data_type(DataType::F32)); - d_strides.set(1, element_size_from_data_type(DataType::F32) * output_matrix_row_stride); + Strides d_strides(data_type_size); + d_strides.set(1, data_type_size * output_matrix_row_stride); + //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. d_strides.set(2, 0); - //d_strides.set(2, element_size_from_data_type(DataType::F32) * output_matrix_stride / n_gemms); - d_strides.set(3, element_size_from_data_type(DataType::F32) * output_matrix_stride); + d_strides.set(3, data_type_size * output_matrix_stride); TensorInfo a_info, b_info, d_info; - a_info.init(a_shape, 1, DataType::F32, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, DataType::F32, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, DataType::F32, d_strides, 0, output_storage_size); + a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); + b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); + d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); _input_workspace.allocator()->init(a_info, storage_alignment); _input_workspace.allocator()->allocate(); @@ -276,12 +286,12 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); _input_nhwc.allocator()->allocate(); transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } else { transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - reinterpret_cast(_input_workspace.buffer()), input_matrix_stride); + &_input_workspace, input_matrix_stride); } // Configure WeightsTransform @@ -290,14 +300,14 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } else { // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] 
_permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U)); - transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); } _weights_hwio.allocator()->allocate(); @@ -306,13 +316,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * if(data_layout == DataLayout::NCHW) { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, &_output_nhwc, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } else { - transform_output_kernel->configure(biases, reinterpret_cast(_output_workspace.buffer()), + transform_output_kernel->configure(biases, &_output_workspace, output_matrix_stride, _output, in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels); } -- cgit v1.2.1
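
Note on the pattern this patch applies (illustrative only, not part of the commit): the kernels stop caching a raw `T *` captured at configure() time and instead store the `ITensor *` itself, calling `tensor->buffer()` only inside run() — see for example `reinterpret_cast<T *>(_output->buffer())` in the transformed kernels above. The sketch below is a minimal stand-alone rendering of that idea; `ITensor` and `ExampleTransformKernel` here are simplified stand-ins written for this note, not the library's real classes or signatures.

// Minimal sketch of the configure-with-handle / resolve-buffer-in-run pattern.
// Storage can be (re)allocated by a memory manager between configure() and run().
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-in for arm_compute::ITensor: the buffer may not exist yet at configure() time.
class ITensor
{
public:
    void allocate(size_t bytes) { _storage.resize(bytes); }
    uint8_t *buffer() { return _storage.data(); }
private:
    std::vector<uint8_t> _storage;
};

// Hypothetical kernel: before the change it would have stored "float *_output" captured
// at configure() time; now it stores the tensor handle and casts lazily.
class ExampleTransformKernel
{
public:
    void configure(ITensor *output, int num_elements)
    {
        _output       = output;       // no buffer() call here: backing memory may be unallocated
        _num_elements = num_elements;
    }
    void run()
    {
        auto *out = reinterpret_cast<float *>(_output->buffer()); // resolved at execution time
        for(int i = 0; i < _num_elements; ++i)
        {
            out[i] = 0.f;
        }
    }
private:
    ITensor *_output{ nullptr };
    int      _num_elements{ 0 };
};

int main()
{
    ITensor                workspace;
    ExampleTransformKernel kernel;
    kernel.configure(&workspace, 8);       // workspace storage not yet allocated
    workspace.allocate(8 * sizeof(float)); // e.g. performed later by a memory manager
    kernel.run();
    std::printf("first element: %f\n", reinterpret_cast<float *>(workspace.buffer())[0]);
    return 0;
}

Because the buffer address is no longer baked in at configure() time, a memory manager can defer, reuse, or move the workspace allocations — which is exactly the follow-up this commit describes as its goal.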