From a3b1b469276b10484cd45901ae3a4b48b506caa9 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 16 Nov 2017 19:24:39 +0000 Subject: COMPMID-667: Add validation static method to NEON GEMMlowp Change-Id: I8a470cc1351593ad8eeaf4ec92e04865e83d4f3c Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96147 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/ITensorInfo.h | 4 +- .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 12 +- .../NEON/kernels/NEGEMMInterleaveBlockedKernel.h | 11 ++ .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 11 +- .../kernels/NEGEMMLowpOffsetContributionKernel.h | 11 ++ ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 11 ++ .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 20 +++ .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 10 +- .../kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h | 10 ++ arm_compute/core/SubTensorInfo.h | 3 +- arm_compute/core/TensorInfo.h | 3 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 9 ++ .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 11 ++ .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 76 ++++++++--- .../NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp | 92 +++++++++---- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 79 +++++++---- .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 151 ++++++++++++--------- ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 101 +++++++++----- .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 107 +++++++++++---- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 92 +++++++++---- .../kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp | 60 ++++++-- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 88 +++++++++++- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 5 + tests/validation/NEON/GEMMLowp.cpp | 33 +++++ 24 files changed, 765 insertions(+), 245 deletions(-) diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index 1bc0a80bac..4f69442b48 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -201,8 +201,10 @@ public: /** Set the flag whether the tensor size can be changed. * * @param[in] is_resizable Flag that marks the tensor if it can be changed or not. + * + * @return Reference to this ITensorInfo object */ - virtual void set_is_resizable(bool is_resizable) = 0; + virtual ITensorInfo &set_is_resizable(bool is_resizable) = 0; /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined. * * @return The valid region. diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 1c0d85c27b..db719caccb 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -56,10 +56,18 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel + * + * @param[in] input Input tensor info. 
Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -67,7 +75,7 @@ public:
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
index cdeb11d606..1a5b0fb863 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
@@ -50,6 +50,17 @@ public:
      * @param[in]  transpose    True if transpose operation must be performed, false otherwise.
      */
     void configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleaveBlockedKernel
+     *
+     * @param[in] input        Input tensor. Data types supported: U8
+     * @param[in] output       Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+     * @param[in] block_height The height of the blocks to be interleaved.
+     * @param[in] block_width  The width of the blocks to be interleaved.
+     * @param[in] transpose    True if transpose operation must be performed, false otherwise.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index e9bfe4ea07..d9986b6cdd 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -58,11 +58,20 @@ public:
      * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
      * kernels change the layout of the original matrices to be more cache-friendly.
      *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: ASYMM8
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
      * @param[in]  input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
      * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
      */
     void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
+     *
+     * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
+     * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 8c1bae9396..27cb3f2c1c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -68,6 +68,19 @@ public:
      * @param[in]      b_offset       Offset to be added to each element of the matrix B.
      */
     void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
+     *
+     * @param[in] mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index 4ec0e9df93..654dee21af 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -70,6 +70,19 @@ public:
      *                    Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
     void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+     *
+     * @param[in] input  Input tensor. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 6eee54a9f0..9ca5cdf828 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -77,6 +77,16 @@ public:
      * @param[in]  is_interleaved4x4 True if the matrix A has been interleaved4x4
      */
     void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
+     *
+     * @param[in] mtx_a             Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_row    Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in] num_mtx_a_cols    Number of matrix A columns
+     * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -98,6 +108,16 @@ public:
      * @param[in]  is_transposed1xW True if the input tensor is transposed 1xW
      */
     void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
+     *
+     * @param[in] mtx_b            Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_col   Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in] num_mtx_b_rows   Number of matrix B rows
+     * @param[in] is_transposed1xW True if the input tensor is transposed 1xW
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index 4d0bb2a484..4436d1fdb0 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -70,10 +70,18 @@ class NEGEMMTranspose1xWKernel : public INESimpleKernel
 public:
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output Output tensor info. Data type supported: same as @p input.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
index 26af626aaa..66684a1185 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
@@ -39,6 +39,18 @@ class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel
 public:
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel
+     *
+     * The computed function is C = a * AxB + b * C.
+     *
+     * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8
+     * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0
+     * @param[in] output Output tensor info to store the result of matrix multiplication.
+     *                   If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
 
 protected:
     void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index b8a36854dc..7c464c0b17 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -185,10 +185,11 @@ public:
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _parent->is_resizable();
     }
-    void set_is_resizable(bool is_resizable) override
+    ITensorInfo &set_is_resizable(bool is_resizable) override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         _parent->set_is_resizable(is_resizable);
+        return *this;
     }
     ValidRegion valid_region() const override
     {
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 5fd6c47818..80ef7f8d5a 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -280,9 +280,10 @@ public:
     {
         return _is_resizable;
     }
-    void set_is_resizable(bool is_resizable) override
+    ITensorInfo &set_is_resizable(bool is_resizable) override
    {
         _is_resizable = is_resizable;
+        return *this;
     }
     ValidRegion valid_region() const override
     {
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 0c441df4b9..598756e435 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -71,6 +71,15 @@ public:
      * @param[out] output Output tensor. Data type supported: S32
      */
     void configure(const ITensor *a, const ITensor *b, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
+     *
+     * @param[in] a      First input tensor  (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b      Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in] output Output tensor. Data type supported: S32
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index a3db23aaee..9270d5581f 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -73,6 +73,19 @@ public:
      *                    Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
     void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
+     *
+     * @param[in] input  Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index a29b661a00..1f4d9b176e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -40,6 +40,48 @@ using namespace arm_compute;
 
 namespace
 {
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = input->tensor_shape();
+        output_shape.set(0, input->dimension(0) * 4);
+        output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    unsigned int           num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    bool                   window_changed                      = false;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
 void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
 {
     const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -132,10 +176,7 @@ NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
 
 void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
-                                                  DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     TensorShape output_shape = input->info()->tensor_shape();
     output_shape.set(0, input->info()->dimension(0) * 4);
@@ -144,21 +185,16 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    unsigned int num_elems_processed_per_iteration_x = 4;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
     switch(input->info()->element_size())
     {
         case 1:
-            num_elems_processed_per_iteration_x = 8;
-            _func = &gemm_interleave_8bit_elements;
+            _func = &gemm_interleave_8bit_elements;
             break;
         case 2:
             _func = &gemm_interleave_16bit_elements;
@@ -172,15 +208,17 @@
     }
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, output_access, input_access);
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+Error NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
index 2a4a46e76c..e971dcba8e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
@@ -40,6 +40,60 @@ using namespace arm_compute;
 
 namespace
 {
+TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height)
+{
+    TensorShape output_shape      = input->tensor_shape();
+    const float interleave_by_f32 = block_height;
+    output_shape.set(0, input->dimension(0) * interleave_by_f32);
+    output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32));
+    return output_shape;
+}
+
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block width must be greater than 0");
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+    const unsigned int num_elems_processed_per_iteration_x = block_width;
+    const unsigned int num_elems_processed_per_iteration_y = block_height;
+    bool               window_changed                      = false;
+
+    // Configure kernel window
+    Window      win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    const float scaley_factor = 1.f / block_height;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output,
+                                            0, 0,
+                                            num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y,
+                                            1, num_elems_processed_per_iteration_y, scaley_factor);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
 inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
 {
     const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -122,20 +176,13 @@ NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
 
 void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
-    ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = input->info()->tensor_shape();
-    const float interleave_by_f32 = block_height;
-    output_shape.set(0, input->info()->dimension(0) * interleave_by_f32);
-    output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32));
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height));
 
     _input        = input;
     _output       = output;
@@ -143,20 +190,19 @@
     _block_width  = block_width;
     _transpose    = transpose;
 
-    const unsigned int num_elems_processed_per_iteration_x = block_width;
-    const unsigned int num_elems_processed_per_iteration_y = block_height;
-
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    const float scaley_factor = 1.f / interleave_by_f32;
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor);
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, output_access, input_access);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+Error NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose)
+{
+    ARM_COMPUTE_UNUSED(transpose);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 5f052f797d..2bc251e91f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -45,6 +45,48 @@ namespace arm_compute
 class Coordinates;
 } // namespace arm_compute
 
+namespace
+{
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    TensorShape in0_shape = input0->tensor_shape();
+    TensorShape in1_shape = input1->tensor_shape();
+    TensorShape out_shape = output->tensor_shape();
+
+    in0_shape.collapse(2);
+    in1_shape.collapse(2);
+    out_shape.collapse(2);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic     in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
+    AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+    AccessWindowRectangle  output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+    bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) { @@ -52,42 +94,29 @@ NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - // Check if matrix B should be slidden or not - // Don't slide matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - TensorShape in0_shape = input0->info()->tensor_shape(); TensorShape in1_shape = input1->info()->tensor_shape(); - TensorShape out_shape = output->info()->tensor_shape(); - - in0_shape.collapse(2); in1_shape.collapse(2); - out_shape.collapse(2); - - ARM_COMPUTE_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); _input0 = input0; _input1 = input1; _output = output; _slide_matrix_b = in1_shape[2] != 1; - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic in0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), input0->info()->dimension(1)); - AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - update_window_and_padding(win, in0_access, in1_access, output_access); +Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - INEKernel::configure(win); + return Error{}; } void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index bd550db54c..62f4014acb 100644 --- 
a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -44,106 +44,131 @@ namespace arm_compute class Coordinates; } // namespace arm_compute -NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) +namespace { -} - -void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +Error validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col_shape[1] != 1; + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr if(b_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1)); - TensorShape output_shape = mm_result->info()->tensor_shape(); - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + TensorShape output_shape = mm_result->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse(1); output_shape.collapse(2); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], + "mm_result tensor must have the same number of batches of output tensor"); if(a_offset != 0) { - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse(1); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same 
number of batches of vector_sum_row_shape or the number of batches must be set to 1");
         }
     }
 
-    _vector_sum_col = vector_sum_col;
-    _vector_sum_row = vector_sum_row;
-    _mm_result      = mm_result;
-    _a_offset       = a_offset;
-    _b_offset       = b_offset;
-    _k_offset       = a_offset * b_offset * k;
+    return Error{};
+}
 
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+                                                       int32_t a_offset, int32_t b_offset)
+{
     constexpr unsigned int num_elems_processed_per_iteration = 16;
+    bool                   window_changed                    = false;
 
     // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win,
+                                                                 mm_result_access);
 
-    // Accordingly with a_offset and b_offset, we can have 4 cases:
-    // a_offset != 0 && b_offset != 0
-    // a_offset = 0 && b_offset != 0
-    // a_offset != 0 && b_offset = 0
-    // a_offset = 0 && b_offset = 0
-    if(a_offset != 0 && b_offset != 0)
+    if(a_offset != 0)
     {
-        AccessWindowStatic     vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
-        AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
-
-        update_window_and_padding(win,
-                                  vector_sum_col_access,
-                                  vector_sum_row_access,
-                                  mm_result_access);
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_col_access);
     }
-    else if(a_offset == 0 && b_offset != 0)
+    if(b_offset != 0)
     {
-        AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
-
-        update_window_and_padding(win,
-                                  vector_sum_row_access,
-                                  mm_result_access);
+        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_row_access);
     }
-    else if(a_offset != 0 && b_offset == 0)
-    {
-        AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win,
-                                  vector_sum_col_access,
-                                  mm_result_access);
-    }
-    else
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+{
+}
+
+void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                  a_offset, b_offset)); // NOLINT
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _mm_result      = mm_result;
+    _a_offset       = a_offset;
+    _b_offset       = b_offset;
+    _k_offset       = a_offset * b_offset * k;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
     {
-        update_window_and_padding(win,
-                                  mm_result_access);
+        TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); // NOLINT
+        vector_sum_col_shape.collapse(1);
+
+        // Check if vector_sum_col_shape should be slid or not
+        // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
     }
 
-    INEKernel::configure(win);
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(mm_result->info(),
+                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                    a_offset, b_offset);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+                                                   int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              a_offset, b_offset)
+                                .first); // NOLINT
+
+    return Error{};
 }
 
 void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 26aaa2a9d5..670b11fe67 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -40,6 +40,50 @@ using namespace arm_compute;
 
 namespace
 {
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check biases if exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access,
+                                                    output_result_access);
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
 inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
 {
     // Add the offset terms to GEMM's result
@@ -185,17 +229,13 @@ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON(max > 255);
-    ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
-    }
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (bias != nullptr) ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  min,
+                                                  max));
 
     _input = input;
     _bias  = bias;
@@ -206,34 +246,25 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp
     _min = min;
     _max = max;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
     // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              input_access,
-                              output_result_access);
-
-    if(bias != nullptr)
-    {
-        AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
-
-        update_window_and_padding(win,
-                                  bias_access);
-    }
-
-    output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 
+    // Check if we need to clamp the result using min and max
     const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+    _func                      = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+}
 
-    // Check if we need to clamp the result using min and max
-    _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+Error NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Error{};
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 81d9b5bb81..a8395a15cb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -44,6 +44,59 @@ namespace arm_compute
 class Coordinates;
 } // namespace arm_compute
 
+namespace
+{
+Error validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    return Error{};
+}
+std::pair<Error, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
+{
+    const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
+Error validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
     : _input(), _output(), _k(0), _is_reshaped(false)
 {
@@ -51,29 +104,28 @@ INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
 
 void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
 
     _input       = mtx_a;
     _output      = vector_sum_row;
     _k           = num_mtx_a_cols;
     _is_reshaped = is_interleaved4x4;
 
-    const unsigned int num_elems_processed_per_iteration = _is_reshaped ? 
4 : 1; - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Error NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +{ + ARM_COMPUTE_UNUSED(num_mtx_a_cols); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) @@ -200,29 +252,28 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); _input = mtx_b; _output = vector_sum_col; _k = num_mtx_b_rows; _is_reshaped = is_transposed1xW; - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Error NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +{ + ARM_COMPUTE_UNUSED(num_mtx_b_rows); + ARM_COMPUTE_UNUSED(is_transposed1xW); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 7f83144e12..7d79c664d1 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -41,45 +41,87 @@
 
 using namespace arm_compute;
 
-void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
-                                                  DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    TensorShape  output_shape{ input->tensor_shape() };
+    const size_t transpose_w = 16 / input->element_size();
+    output_shape.set(0, input->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->dimension(0) / static_cast<float>(transpose_w)))));
+    return output_shape;
+}
 
-    TensorShape  output_shape{ input->info()->tensor_shape() };
-    const size_t transpose_w = 16 / input->info()->element_size();
-    output_shape.set(0, input->info()->dimension(1) * transpose_w);
-    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    return Error{};
+}
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
     const int          scale_x                           = num_elems_processed_per_iteration;
-
-    _input  = input;
-    _output = output;
+    bool               window_changed                    = false;
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
 
-    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+    }
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
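To make the shape rule in get_output_shape() concrete (made-up sizes): transpose_w = 16 / element_size, so for U8 data transpose_w is 16 and a (width, height) = (8, 4) input maps to (4 * 16, ceil(8 / 16.0)) = (64, 1):

    TensorInfo input(TensorShape(8U, 4U), 1, DataType::U8);
    TensorInfo output(TensorShape(64U, 1U), 1, DataType::U8); // equals get_output_shape(input)
    Error err = NEGEMMTranspose1xWKernel::validate(&input, &output); // expected to pass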
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
index ae711af89a..a025cac82d 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
@@ -48,13 +48,47 @@ namespace arm_compute
 // Enable only if compiled for AArch64-V8.2-A targets
 #ifdef ARM_COMPUTE_AARCH64_V8_2
 
+namespace
+{
+using namespace arm_compute;
+
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    // Configure kernel window
+    Window win = calculate_max_window(*output);
+
+    AccessWindowRectangle output_access(output, 0, 0, 12, 8);
+
+    const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
+    const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
+                                                    AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
+                                                    output_access);
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 namespace arm_compute
 {
 void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
@@ -66,19 +100,17 @@ void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, cons
     _transform_1 = transform_1;
 
     // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
+Error NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)
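The window above walks the output in 12x8 blocks and pads each input row up to the block multiples, which is exactly what the cloned-info dry run in validate() has to account for. Worked numbers (illustrative row width of 100 elements, not from the patch):

    const int input0_access_end = ceil_to_multiple(100, 8);  // = 104: matrix A rows are read in blocks of 8
    const int input1_access_end = ceil_to_multiple(100, 12); // = 108: matrix B rows are read in blocks of 12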
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 0fff6c9ca1..92c911c370 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -54,12 +54,8 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMem
 }
 
 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info()));
 
     bool dot_product_path = false;
@@ -185,6 +181,86 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
     }
 }
 
+Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+                                    "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+                                    "The output matrix must have the same number of columns as the matrix B");
+
+    int32_t a_offset = a->quantization_info().offset;
+    int32_t b_offset = b->quantization_info().offset;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    // Check for DOT product instruction
+    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
+    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+    if(cpu_has_dotprod != 0)
+    {
+        // Validate matrix multiply kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->tensor_shape();
+        shape_tmp_a.set(0, a->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->tensor_shape();
+        shape_tmp_b.set(0, b->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+    }
+
+    TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if(a_offset != 0)
+    {
+        TensorShape shape_vector_sum_col = b->tensor_shape();
+        shape_vector_sum_col.remove_dimension(1);
+        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(b_offset != 0)
+    {
+        TensorShape shape_vector_sum_row = a->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
+        shape_vector_sum_row.remove_dimension(1);
+        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    // Validate offset contribution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                             a_offset, b_offset));
+
+    return Error{};
+}
+
 void NEGEMMLowpMatrixMultiplyCore::run()
 {
     _memory_group.acquire();
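Worked example of the temporary shapes derived on the non-DOT path, using the sizes from the test below: for a = (21, 13) and b = (33, 21), the interleaved A info becomes (21 * 4, ceil(13 / 4.0)) = (84, 4) and the transpose1xW B info becomes (21 * 16, ceil(33 / 16.0)) = (336, 3):

    // Illustrative check of the shape arithmetic in validate() above.
    const TensorShape shape_tmp_a(84U, 4U);  // a = (21, 13): 21 * 4 = 84,   ceil(13 / 4.f)  = 4
    const TensorShape shape_tmp_b(336U, 3U); // b = (33, 21): 21 * 16 = 336, ceil(33 / 16.f) = 3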
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 66cdf58634..ed51291e95 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -34,4 +34,9 @@ void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, co
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
     k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
     _kernel = std::move(k);
+}
+
+Error NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
 }
\ No newline at end of file
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 078096a0dd..1418578a51 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -113,6 +113,39 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c
     gemmlowp_mm.configure(&a, &b, &c);
 }
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
+                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::QS8, 2),                                 // Mismatching data type
+                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
+                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
+                                             TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)),
+                                           }),
+    framework::dataset::make("InputBInfo", { TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                           })),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
+                                           })),
+    framework::dataset::make("Expected", { true, true, true, true, false })),
+    a_info, b_info, output_info, expected)
+{
+    // Lock tensors
+    Error error = NEGEMMLowpMatrixMultiplyCore::validate(&a_info.clone()->set_is_resizable(false),
+                                                         &b_info.clone()->set_is_resizable(false),
+                                                         &output_info.clone()->set_is_resizable(false));
+    ARM_COMPUTE_EXPECT(bool(error) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
-- 
cgit v1.2.1