From 4a578b923ed000c67fe0bc1433f945aea634ca9c Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 25 Jun 2021 12:13:49 +0100 Subject: Port the ClGemmLowp kernels to the new API Ported kernels: - CLGEMMLowpMatrixMultiplyNativeKernel - CLGEMMLowpMatrixMultiplyReshapedKernel - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLGEMMLowpOffsetContributionKernel - CLGEMMLowpOffsetContributionOutputStageKernel - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - CLGEMMLowpQuantizeDownInt32ScaleKernel Signed-off-by: Georgios Pinitas Change-Id: I9d5a744d6a2dd2f2726fdfb291bad000b6970de2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5870 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- Android.bp | 18 +- arm_compute/core/experimental/Types.h | 39 +- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 31 +- .../runtime/CL/functions/CLGEMMLowpOutputStage.h | 260 +--------- .../runtime/CL/functions/CLLSTMLayerQuantized.h | 68 +-- arm_compute/runtime/CL/functions/CLQLSTMLayer.h | 140 ++--- docs/user_guide/release_version_and_change_log.dox | 44 +- examples/gemm_tuner/cl_gemmlowp_reshaped.cpp | 19 +- ...aped_rhs_only_fused_output_stage_fixedpoint.cpp | 30 +- filelist.json | 78 +-- src/core/CL/CLKernels.h | 9 - .../CLGEMMLowpMatrixMultiplyNativeKernel.cpp | 335 ------------ .../kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h | 108 ---- .../CLGEMMLowpMatrixMultiplyReshapedKernel.cpp | 298 ----------- .../CLGEMMLowpMatrixMultiplyReshapedKernel.h | 125 ----- ...GEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp | 574 --------------------- ...CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h | 155 ------ .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 215 -------- .../kernels/CLGEMMLowpOffsetContributionKernel.h | 116 ----- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 274 ---------- ...CLGEMMLowpOffsetContributionOutputStageKernel.h | 136 ----- ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 153 ------ ...MLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 89 ---- ...GEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 ------ ...CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 101 ---- .../CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp | 158 ------ .../CLGEMMLowpQuantizeDownInt32ScaleKernel.h | 102 ---- src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 221 -------- src/core/CL/kernels/CLGEMMLowpReductionKernel.h | 176 ------- .../ClGemmLowpMatrixMultiplyNativeKernel.cpp | 335 ++++++++++++ .../kernels/ClGemmLowpMatrixMultiplyNativeKernel.h | 81 +++ .../ClGemmLowpMatrixMultiplyReshapedKernel.cpp | 300 +++++++++++ .../ClGemmLowpMatrixMultiplyReshapedKernel.h | 90 ++++ ...GemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp | 544 +++++++++++++++++++ ...ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h | 100 ++++ .../kernels/ClGemmLowpOffsetContributionKernel.cpp | 212 ++++++++ .../kernels/ClGemmLowpOffsetContributionKernel.h | 86 +++ ...GemmLowpOffsetContributionOutputStageKernel.cpp | 263 ++++++++++ ...ClGemmLowpOffsetContributionOutputStageKernel.h | 90 ++++ ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 160 ++++++ ...mLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 78 +++ ...GemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 ++++++ ...ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h | 80 +++ .../ClGemmLowpQuantizeDownInt32ScaleKernel.cpp | 157 ++++++ .../ClGemmLowpQuantizeDownInt32ScaleKernel.h | 80 +++ .../gpu/cl/kernels/ClGemmLowpReductionKernel.cpp | 219 ++++++++ .../gpu/cl/kernels/ClGemmLowpReductionKernel.h | 124 +++++ src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 5 - .../CL/functions/CLGEMMConvolutionLayer.cpp | 5 - .../CL/functions/CLGEMMDeconvolutionLayer.cpp | 5 - .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 157 ++++-- src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 138 ++--- src/runtime/CL/functions/CLLSTMLayer.cpp | 5 - src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 20 +- src/runtime/CL/functions/CLQLSTMLayer.cpp | 100 ++-- src/runtime/CL/functions/CLRNNLayer.cpp | 5 - tests/validation/CL/GEMMLowp.cpp | 134 ----- .../validation/CL/GEMMLowpMatrixMultiplyNative.cpp | 6 +- .../CL/GEMMLowpMatrixMultiplyReshaped.cpp | 4 +- .../CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp | 6 +- tests/validation/fixtures/GEMMLowpFixture.h | 30 +- 61 files changed, 3639 insertions(+), 4372 deletions(-) delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h diff --git a/Android.bp b/Android.bp index 6250539d02..822afc1eca 100644 --- a/Android.bp +++ b/Android.bp @@ -96,15 +96,6 @@ cc_library_static { "src/core/CL/kernels/CLFFTScaleKernel.cpp", "src/core/CL/kernels/CLFillBorderKernel.cpp", "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp", "src/core/CL/kernels/CLGatherKernel.cpp", "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp", "src/core/CL/kernels/CLIm2ColKernel.cpp", @@ -351,6 +342,15 @@ cc_library_static { "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp", "src/core/gpu/cl/kernels/ClFillKernel.cpp", "src/core/gpu/cl/kernels/ClFloorKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp", "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp", "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp", "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp", diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index 92ece460dc..1983344075 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -39,15 +39,24 @@ enum TensorType : int32_t { ACL_UNKNOWN = -1, ACL_SRC_DST = 0, - ACL_SRC = 0, - ACL_SRC_0 = 0, - ACL_SRC_1 = 1, - ACL_SRC_2 = 2, - ACL_DST = 30, - ACL_DST_0 = 30, - ACL_DST_1 = 31, - ACL_DST_2 = 32, - ACL_BIAS = ACL_SRC_2, + + // Src + ACL_SRC = 0, + ACL_SRC_0 = 0, + ACL_SRC_1 = 1, + ACL_SRC_2 = 2, + ACL_SRC_3 = 3, + ACL_SRC_4 = 4, + ACL_SRC_5 = 5, + ACL_SRC_6 = 6, + + // Dst + ACL_DST = 30, + ACL_DST_0 = 30, + ACL_DST_1 = 31, + ACL_DST_2 = 32, + + // Aux ACL_INT = 50, ACL_INT_0 = 50, ACL_INT_1 = 51, @@ -56,7 +65,17 @@ enum TensorType : int32_t ACL_INT_4 = 54, ACL_SRC_VEC = 256, ACL_DST_VEC = 512, - ACL_INT_VEC = 1024 + ACL_INT_VEC = 1024, + + // Aliasing Types + // Conv etc + ACL_BIAS = ACL_SRC_2, + + // Gemm + ACL_VEC_ROW_SUM = ACL_SRC_3, + ACL_VEC_COL_SUM = ACL_SRC_4, + ACL_SHIFTS = ACL_SRC_5, + ACL_MULTIPLIERS = ACL_SRC_6, }; namespace experimental diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 3d2dbdb104..e62db8e644 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -34,17 +34,17 @@ class CLCompileContext; class IMemoryManager; class ICLTensor; class ITensorInfo; -class CLGEMMLowpMatrixMultiplyNativeKernel; -class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel; -class CLGEMMLowpOffsetContributionKernel; -class CLGEMMLowpOffsetContributionOutputStageKernel; -class CLGEMMLowpMatrixAReductionKernel; -class CLGEMMLowpMatrixBReductionKernel; namespace opencl { namespace kernels { class ClGemmReshapeRhsMatrixKernel; +class ClGemmLowpMatrixMultiplyNativeKernel; +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel; +class ClGemmLowpOffsetContributionKernel; +class ClGemmLowpOffsetContributionOutputStageKernel; +class ClGemmLowpMatrixAReductionKernel; +class ClGemmLowpMatrixBReductionKernel; } // namespace kernels } // namespace opencl @@ -150,14 +150,14 @@ private: MemoryGroup _memory_group; // Kernels used - std::unique_ptr _weights_to_qasymm8; - std::unique_ptr _mm_native_kernel; - std::unique_ptr _mm_reshaped_only_rhs_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - std::unique_ptr _mtx_a_reduction_kernel; - std::unique_ptr _mtx_b_reduction_kernel; - std::unique_ptr _offset_contribution_kernel; - std::unique_ptr _offset_contribution_output_stage_kernel; + std::unique_ptr _weights_to_qasymm8; + std::unique_ptr _mm_native_kernel; + std::unique_ptr _mm_reshaped_only_rhs_kernel; + std::unique_ptr _mtx_b_reshape_kernel; + std::unique_ptr _mtx_a_reduction_kernel; + std::unique_ptr _mtx_b_reduction_kernel; + std::unique_ptr _offset_contribution_kernel; + std::unique_ptr _offset_contribution_output_stage_kernel; // Temporary tensors CLTensor _qasymm8_weights; @@ -171,7 +171,8 @@ private: // Tensor pointers const ICLTensor *_matrix_a; const ICLTensor *_original_b; - const ICLTensor *_output; + const ICLTensor *_c; + ICLTensor *_output; int32_t _a_offset; int32_t _b_offset; diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h index a60992a0f4..e85f2db8a9 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -45,247 +45,28 @@ class ICLTensor; class ITensorInfo; struct GEMMLowpOutputStageInfo; -/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL. - * - * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following OpenCL kernels: - * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); - /** Initialise the kernel's inputs, output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, - int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - * - * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); -}; - -/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL. - * - * CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following OpenCL kernels: - * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); - /** Initialise the kernel's inputs, output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, - int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - * - * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); -}; - -/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL. - * - * CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters: - * - * result_fixedpoint_multiplier, result_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following CL kernels: - * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits::lowest(), - int max = std::numeric_limits::max()); - /** Initialise the kernel's inputs, output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, - int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint - * - * @param[in] input Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor info. Data type supported: QSYMM16 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits::lowest(), int max = std::numeric_limits::max()); -}; /** Basic function to execute GEMMLowpQuantizeDown kernels on CL. * * This function calls the following CL kernels: * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel */ -class CLGEMMLowpOutputStage : public ICLSimpleFunction +class CLGEMMLowpOutputStage : public IFunction { public: + CLGEMMLowpOutputStage(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMLowpOutputStage(const CLGEMMLowpOutputStage &) = delete; + /** Default move constructor */ + CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMLowpOutputStage &operator=(const CLGEMMLowpOutputStage &) = delete; + /** Default move assignment operator */ + CLGEMMLowpOutputStage &operator=(CLGEMMLowpOutputStage &&); + /** Default destructor */ + ~CLGEMMLowpOutputStage(); /** Initialise the kernel's inputs, output * * Valid data layouts: @@ -315,7 +96,7 @@ public: * @param[in] info GEMMLowp output stage metadata. */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. @@ -326,6 +107,15 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr _kernel; + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h index 2ef7427a5a..9c004b85d0 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h @@ -47,16 +47,16 @@ class ICLTensor; * * This function calls the following CL functions/kernels: * - * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 - * -# @ref CLTranspose Matrix transpose - * -# @ref CLConcatenateLayer Tensor concatenation - * -# @ref CLActivationLayer Activation functions (tanh and logistic) - * -# @ref CLArithmeticAddition Elementwise addition - * -# @ref CLPixelWiseMultiplication Elementwise multiplication - * -# @ref CLSlice Tensor slicing - * -# @ref CLDequantizationLayer Dequantize into float - * -# @ref CLQuantizationLayer Quantize from float + * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers + * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 + * -# @ref CLTranspose Matrix transpose + * -# @ref CLConcatenateLayer Tensor concatenation + * -# @ref CLActivationLayer Activation functions (tanh and logistic) + * -# @ref CLArithmeticAddition Elementwise addition + * -# @ref CLPixelWiseMultiplication Elementwise multiplication + * -# @ref CLSlice Tensor slicing + * -# @ref CLDequantizationLayer Dequantize into float + * -# @ref CLQuantizationLayer Quantize from float * */ class CLLSTMLayerQuantized : public IFunction { @@ -170,30 +170,30 @@ private: MemoryGroup _memory_group; // Functions used - CLGEMMLowpMatrixMultiplyCore _gemmlowp; - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage; - CLTranspose _transpose_weights; - CLConcatenateLayer _concat_input_weights; - CLConcatenateLayer _concat_recurrent_weights; - CLConcatenateLayer _concat_weights; - CLConcatenateLayer _concat_inputs; - CLConcatenateLayer _concat_bias; - CLActivationLayer _sigmoid_forget_gate; - CLActivationLayer _sigmoid_input_gate; - CLActivationLayer _sigmoid_output_gate; - CLActivationLayer _tanh_modulation_gate; - CLActivationLayer _tanh_output_state; - CLArithmeticAddition _add_cell_state_tmps; - CLArithmeticAddition _add2; - CLPixelWiseMultiplication _mul_forget_gate_cell_state; - CLPixelWiseMultiplication _mul_input_gate_input_mod_gate; - CLPixelWiseMultiplication _mul_output_state_tmp_output_gate; - CLSlice _slice_input_tensor; - CLSlice _slice_forget_tensor; - CLSlice _slice_cell_tensor; - CLSlice _slice_output_tensor; - CLDequantizationLayer _dequantize; - CLQuantizationLayer _quantize; + CLGEMMLowpMatrixMultiplyCore _gemmlowp; + CLGEMMLowpOutputStage _output_stage; + CLTranspose _transpose_weights; + CLConcatenateLayer _concat_input_weights; + CLConcatenateLayer _concat_recurrent_weights; + CLConcatenateLayer _concat_weights; + CLConcatenateLayer _concat_inputs; + CLConcatenateLayer _concat_bias; + CLActivationLayer _sigmoid_forget_gate; + CLActivationLayer _sigmoid_input_gate; + CLActivationLayer _sigmoid_output_gate; + CLActivationLayer _tanh_modulation_gate; + CLActivationLayer _tanh_output_state; + CLArithmeticAddition _add_cell_state_tmps; + CLArithmeticAddition _add2; + CLPixelWiseMultiplication _mul_forget_gate_cell_state; + CLPixelWiseMultiplication _mul_input_gate_input_mod_gate; + CLPixelWiseMultiplication _mul_output_state_tmp_output_gate; + CLSlice _slice_input_tensor; + CLSlice _slice_forget_tensor; + CLSlice _slice_cell_tensor; + CLSlice _slice_output_tensor; + CLDequantizationLayer _dequantize; + CLQuantizationLayer _quantize; // Tensor pointers const ICLTensor *_input_to_input_weights; diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index bd00d56468..1b0b759d74 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -40,9 +40,15 @@ namespace arm_compute // Forward declarations class CLCompileContext; class ICLTensor; -class CLGEMMLowpMatrixAReductionKernel; class CLQLSTMLayerNormalizationKernel; class ITensorInfo; +namespace opencl +{ +namespace kernels +{ +class ClGemmLowpMatrixAReductionKernel; +} // namespace kernels +} // namespace opencl /** Basic function to run @ref CLQLSTMLayer * @@ -52,8 +58,8 @@ class ITensorInfo; * -# @ref CLCopy Copy function for copying output_state_out to output * -# @ref CLArithmeticAddition Elementwise addition and subtraction * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 - * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use + * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 + * -# @ref opencl::kernels::ClGemmLowpMatrixAReductionKernel For precomputing effective biases to use * -# @ref CLPixelWiseMultiplication Elementwise multiplication * -# @ref CLTranspose Transpose function for reshaping the weights * */ @@ -297,70 +303,70 @@ private: }; // Functions used - CLTranspose _transpose_input_to_forget_weights{}; - CLTranspose _transpose_input_to_cell_weights{}; - CLTranspose _transpose_input_to_output_weights{}; - CLTranspose _transpose_input_to_input_weights{}; - CLTranspose _transpose_recurrent_to_forget_weights{}; - CLTranspose _transpose_recurrent_to_cell_weights{}; - CLTranspose _transpose_recurrent_to_output_weights{}; - CLTranspose _transpose_recurrent_to_input_weights{}; - CLTranspose _transpose_projection_weights{}; - std::unique_ptr _input_to_input_reduction; - std::unique_ptr _recurrent_to_input_reduction; - std::unique_ptr _input_to_forget_reduction; - std::unique_ptr _recurrent_to_forget_reduction; - std::unique_ptr _input_to_cell_reduction; - std::unique_ptr _recurrent_to_cell_reduction; - std::unique_ptr _input_to_output_reduction; - std::unique_ptr _recurrent_to_output_reduction; - std::unique_ptr _projection_reduction; - CLArithmeticAddition _projection_bias_add{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; - CLGEMMLowpOutputStage _input_to_forget_outstage{}; - CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; - CLGEMMLowpOutputStage _cell_to_forget_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_forget{}; - CLArithmeticAddition _accumulate_cell_forget{}; - CLActivationLayer _forget_gate_sigmoid{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; - CLGEMMLowpOutputStage _input_to_cell_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; - CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_modulation{}; - CLActivationLayer _cell_gate_tanh{}; - CLArithmeticSubtraction _input_gate_sub{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; - CLGEMMLowpOutputStage _input_to_input_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; - CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_input{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; - CLGEMMLowpOutputStage _cell_to_input_outstage{}; - CLArithmeticAddition _accumulate_cell_input{}; - CLActivationLayer _input_gate_sigmoid{}; - CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; - CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; - CLArithmeticAddition _add_forget_cell{}; - CLActivationLayer _cell_clip{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; - CLGEMMLowpOutputStage _input_to_output_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; - CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_output{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; - CLGEMMLowpOutputStage _cell_to_output_outstage{}; - CLArithmeticAddition _accumulate_cell_to_output{}; - CLActivationLayer _output_gate_sigmoid{}; - CLActivationLayer _hidden_tanh{}; - CLPixelWiseMultiplication _pixelwise_mul_hidden{}; - CLGEMMLowpOutputStage _hidden_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_projection{}; - CLGEMMLowpOutputStage _projection_outstage{}; - CLArithmeticAddition _accumulate_projection{}; - CLActivationLayer _projection_clip{}; + CLTranspose _transpose_input_to_forget_weights{}; + CLTranspose _transpose_input_to_cell_weights{}; + CLTranspose _transpose_input_to_output_weights{}; + CLTranspose _transpose_input_to_input_weights{}; + CLTranspose _transpose_recurrent_to_forget_weights{}; + CLTranspose _transpose_recurrent_to_cell_weights{}; + CLTranspose _transpose_recurrent_to_output_weights{}; + CLTranspose _transpose_recurrent_to_input_weights{}; + CLTranspose _transpose_projection_weights{}; + std::unique_ptr _input_to_input_reduction; + std::unique_ptr _recurrent_to_input_reduction; + std::unique_ptr _input_to_forget_reduction; + std::unique_ptr _recurrent_to_forget_reduction; + std::unique_ptr _input_to_cell_reduction; + std::unique_ptr _recurrent_to_cell_reduction; + std::unique_ptr _input_to_output_reduction; + std::unique_ptr _recurrent_to_output_reduction; + std::unique_ptr _projection_reduction; + CLArithmeticAddition _projection_bias_add{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; + CLGEMMLowpOutputStage _input_to_forget_outstage{}; + CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; + CLGEMMLowpOutputStage _cell_to_forget_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_forget{}; + CLArithmeticAddition _accumulate_cell_forget{}; + CLActivationLayer _forget_gate_sigmoid{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; + CLGEMMLowpOutputStage _input_to_cell_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; + CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_modulation{}; + CLActivationLayer _cell_gate_tanh{}; + CLArithmeticSubtraction _input_gate_sub{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; + CLGEMMLowpOutputStage _input_to_input_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; + CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_input{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; + CLGEMMLowpOutputStage _cell_to_input_outstage{}; + CLArithmeticAddition _accumulate_cell_input{}; + CLActivationLayer _input_gate_sigmoid{}; + CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; + CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; + CLArithmeticAddition _add_forget_cell{}; + CLActivationLayer _cell_clip{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; + CLGEMMLowpOutputStage _input_to_output_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; + CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_output{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; + CLGEMMLowpOutputStage _cell_to_output_outstage{}; + CLArithmeticAddition _accumulate_cell_to_output{}; + CLActivationLayer _output_gate_sigmoid{}; + CLActivationLayer _hidden_tanh{}; + CLPixelWiseMultiplication _pixelwise_mul_hidden{}; + CLGEMMLowpOutputStage _hidden_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_projection{}; + CLGEMMLowpOutputStage _projection_outstage{}; + CLArithmeticAddition _accumulate_projection{}; + CLActivationLayer _projection_clip{}; std::array, _layer_norm_count> _layer_norms; CLCopy _copy_output; diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index eb4c280295..0c8b57ff9f 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -227,7 +227,7 @@ v20.11 Public major release - @ref CLLogSoftmaxLayer - GCSoftmaxLayer - New OpenCL kernels / functions: - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - @ref CLLogicalNot - @ref CLLogicalAnd - @ref CLLogicalOr @@ -260,13 +260,13 @@ v20.11 Public major release - @ref CLBatchNormalizationLayerKernel - CLPoolingLayerKernel - CLWinogradInputTransformKernel - - @ref CLGEMMLowpMatrixMultiplyNativeKernel - - @ref CLGEMMLowpMatrixAReductionKernel - - @ref CLGEMMLowpMatrixBReductionKernel - - @ref CLGEMMLowpOffsetContributionOutputStageKernel - - @ref CLGEMMLowpOffsetContributionKernel + - CLGEMMLowpMatrixMultiplyNativeKernel + - CLGEMMLowpMatrixAReductionKernel + - CLGEMMLowpMatrixBReductionKernel + - CLGEMMLowpOffsetContributionOutputStageKernel + - CLGEMMLowpOffsetContributionKernel - CLWinogradOutputTransformKernel - - @ref CLGEMMLowpMatrixMultiplyReshapedKernel + - CLGEMMLowpMatrixMultiplyReshapedKernel - @ref CLFuseBatchNormalizationKernel - @ref CLDepthwiseConvolutionLayerNativeKernel - CLDepthConvertLayerKernel @@ -281,11 +281,11 @@ v20.11 Public major release - CLLogits1DNormKernel - CLHeightConcatenateLayerKernel - CLGEMMMatrixMultiplyKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - CLGEMMLowpQuantizeDownInt32ScaleKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel + - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLDepthConcatenateLayerKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - Removed OpenCL kernels / functions: - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel @@ -596,9 +596,9 @@ v20.05 Public major release - @ref CLDeconvolutionLayer - @ref CLDirectDeconvolutionLayer - @ref CLGEMMDeconvolutionLayer - - @ref CLGEMMLowpMatrixMultiplyReshapedKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel + - CLGEMMLowpMatrixMultiplyReshapedKernel + - CLGEMMLowpQuantizeDownInt32ScaleKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - @ref CLReductionOperation - @ref CLReduceMean - @ref NEScale @@ -655,9 +655,9 @@ v20.02 Public major release - @ref CLDepthwiseConvolutionLayer - CLDepthwiseConvolutionLayer3x3 - @ref CLGEMMConvolutionLayer - - @ref CLGEMMLowpMatrixMultiplyCore - - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - - @ref CLGEMMLowpMatrixMultiplyNativeKernel + - CLGEMMLowpMatrixMultiplyCore + - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - CLGEMMLowpMatrixMultiplyNativeKernel - @ref NEActivationLayer - NEComparisonOperationKernel - @ref NEConvolutionLayer @@ -680,7 +680,7 @@ v20.02 Public major release - @ref NESplit - New OpenCL kernels / functions: - @ref CLFill - - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint + - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - New Arm® Neon™ kernels / functions: - @ref NEFill - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint @@ -861,7 +861,7 @@ v19.05 Public major release - @ref CLFFTDigitReverseKernel - @ref CLFFTRadixStageKernel - @ref CLFFTScaleKernel - - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel - CLHeightConcatenateLayerKernel - @ref CLDirectDeconvolutionLayer @@ -953,7 +953,7 @@ v19.02 Public major release - @ref CLRangeKernel / @ref CLRange - @ref CLUnstack - @ref CLGatherKernel / @ref CLGather - - @ref CLGEMMLowpMatrixMultiplyReshapedKernel + - CLGEMMLowpMatrixMultiplyReshapedKernel - New CPP kernels / functions: - @ref CPPDetectionOutputLayer - @ref CPPTopKV / @ref CPPTopKVKernel @@ -1247,8 +1247,8 @@ v17.12 Public major release - NEWinogradLayer / NEWinogradLayerKernel - New OpenCL kernels / functions - - @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore - - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint + - CLGEMMLowpOffsetContributionKernel / CLGEMMLowpMatrixAReductionKernel / CLGEMMLowpMatrixBReductionKernel / CLGEMMLowpMatrixMultiplyCore + - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - New graph nodes for Arm® Neon™ and OpenCL - graph::BranchLayer diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp index 3d3f7fef1e..3c8976fddd 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp @@ -33,7 +33,7 @@ #include "arm_compute/runtime/CL/CLTuner.h" #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" @@ -168,8 +168,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMReshapeLHSMatrix = test::CLSynthetizeOperator; -using CLGEMMLowpMatrixMultiplyReshaped = test::CLSynthetizeFunction; +using ClGemmReshapeLHSMatrix = test::CLSynthetizeOperator; +using ClGemmLowpMatrixMultiplyReshaped = test::CLSynthetizeOperator; class CLGEMMLowpMatrixMultiplyReshapedExample : public Example { @@ -269,20 +269,20 @@ public: // Validate argments if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) { - std::cerr << "Invalid arguments for CLGEMMReshapeLHSMatrixKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl; return false; } if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl; return false; } // Configure functions reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info); // Allocate tensors lhs.allocator()->allocate(); @@ -298,7 +298,8 @@ public: ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }); reshape_lhs.run(reshape_lsh_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -315,8 +316,8 @@ private: CLTensor rhs_reshaped{}; CLTensor dst{}; CLTuner tuner{}; - CLGEMMReshapeLHSMatrix reshape_lhs{}; - CLGEMMLowpMatrixMultiplyReshaped gemm{}; + ClGemmReshapeLHSMatrix reshape_lhs{}; + ClGemmLowpMatrixMultiplyReshaped gemm{}; }; /** Main test program for gemmlowp reshaped diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp index ca7b7a5f04..15c1b86c61 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp @@ -35,8 +35,8 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" @@ -47,6 +47,7 @@ using namespace arm_compute; using namespace utils; +using namespace arm_compute::opencl::kernels; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -146,8 +147,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction; -using CLGEMMLowpMatrixAReduction = test::CLSynthetizeFunction; +using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = test::CLSynthetizeOperator; +using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator; class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example { @@ -289,7 +290,7 @@ public: const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32); vector_sum_row.allocator()->init(info_vector_sum_row); - mtx_a_reduction = std::make_unique(); + mtx_a_reduction = std::make_unique(); if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) { @@ -297,7 +298,7 @@ public: return false; } - mtx_a_reduction->configure(&lhs, &vector_sum_row, GEMMLowpReductionKernelInfo{}); + mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}); } // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 if(gemm_info.a_offset != 0) @@ -311,12 +312,14 @@ public: if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info())) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl; return false; } // Configure function - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info, gemm_info.a_offset == 0 ? nullptr : &vector_sum_col, gemm_info.b_offset == 0 ? nullptr : &vector_sum_row, &bias, &dst_multipliers, &dst_shifts); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), + bias.info(), dst_multipliers.info(), dst_shifts.info()); // Allocate tensors lhs.allocator()->allocate(); @@ -335,9 +338,12 @@ public: { if(mtx_a_reduction != nullptr) { - mtx_a_reduction->run(); + ITensorPack red_pack({ { ACL_SRC, &lhs }, { ACL_DST, &dst } }); + mtx_a_reduction->run(red_pack); } - gemm.run(); + + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_BIAS, &bias }, { ACL_VEC_COL_SUM, &vector_sum_col }, { ACL_VEC_ROW_SUM, &vector_sum_row }, { ACL_SHIFTS, &dst_shifts }, { ACL_MULTIPLIERS, &dst_multipliers }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -358,8 +364,8 @@ private: CLTensor dst_multipliers{}; CLTensor dst_shifts{}; CLTuner tuner{}; - CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm{}; - std::unique_ptr mtx_a_reduction{ nullptr }; + ClGemmLowpMatrixMultiplyReshapedOnlyRhs gemm{}; + std::unique_ptr mtx_a_reduction{ nullptr }; }; /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint diff --git a/filelist.json b/filelist.json index bde361f167..8a53829c5b 100644 --- a/filelist.json +++ b/filelist.json @@ -229,6 +229,21 @@ ] } }, + "GEMMLowp": { + "files": { + "kernel": [ + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp" + ] + } + }, "Mul": { "files": { "operator": [ @@ -420,69 +435,6 @@ ] } }, - "GEMMLowpMatrixMultiplyNative": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp" - ] - } - }, - "GEMMLowpMatrixMultiplyReshaped": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp" - ] - } - }, - "GEMMLowpMatrixMultiplyReshapedOnlyRHS": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp" - ] - } - }, - "GEMMLowpOffsetContribution": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp" - ] - } - }, - "GEMMLowpOffsetContributionOutputStage": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp" - ] - } - }, - "GEMMLowpQuantizeDownInt32ScaleByFixedPoint": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp" - ] - } - }, - "GEMMLowpQuantizeDownInt32ScaleByFloat": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp" - ] - } - }, - "GEMMLowpQuantizeDownInt32Scale": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp" - ] - } - }, - "GEMMLowpReduction": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp" - ] - } - }, "InstanceNormalization": { "files": { "kernel": [ diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h index c380a9cc68..09f8109445 100644 --- a/src/core/CL/CLKernels.h +++ b/src/core/CL/CLKernels.h @@ -42,15 +42,6 @@ #include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLGatherKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp deleted file mode 100644 index efba3d1d7a..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(input0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast(k)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast(n)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast(m)); - } - - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - Window win{}; - bool window_changed = false; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*output); - - if(reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // RHS matrix still needs padding on the X - AccessWindowStatic input1_access(input1, 0, 0, - ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - input1->dimension(1)); - - window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false) -{ - _type = CLKernelType::GEMM; -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info); -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ input0, output }); - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1))); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - std::string kernel_name("gemmlowp_mm_native"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if(_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h deleted file mode 100644 index 125f0c6948..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ -class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyNativeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel - * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0 - * @param[in] output Output tensor info. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; - bool _use_dummy_work_items; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp deleted file mode 100644 index fd533fc310..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - TensorShape tensor_shape0{ input0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ input1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); - - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*output); - if(reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - return std::make_pair(Status{}, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false) -{ - _type = CLKernelType::GEMM; -} - -void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info); -} - -void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _k = gemm_info.k(); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - auto padding_info = get_padding_info({ input0, input1, output }); - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); - - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m())); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - std::string kernel_name("gemmlowp_mm_reshaped_"); - kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; - kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if(_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_k)); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h deleted file mode 100644 index 06a73f173d..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped - * - * @note The input matrices @p input0 and @p input1 must be reshaped through: - * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel - * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - */ -class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyReshapedKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel - * - * @param[in] input0 Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] output Output tensor info. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_output_as_3d; - unsigned int _k; - bool _use_dummy_work_items; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp deleted file mode 100644 index 3424b6d264..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(input0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m; - const int n = gemm_info.n; - const int k = gemm_info.k; - - TensorShape tensor_shape1{ input1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast(m)); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); - - const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info); - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - if(output_stage.type == GEMMLowpOutputStageType::NONE) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); - } - } - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), - "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); - - // Checks performed if the output stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2])); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]); - - if(expected_output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - TensorShape collapsed_output_shape(expected_output_shape); - collapsed_output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx], - "vector_sum_row must have the same number of batches of output tensor"); - - if(gemm_info.a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type()); - } - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - - if(output_multipliers != nullptr && output_shifts != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0)); - } - } - } - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) -{ - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // Output tensor auto initialization if not yet initialized - const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) - { - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type)); - } - else - { - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32)); - } - - TensorInfo tmp_info(*output); - - if(reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; - num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - if(gemm_info.a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); - } - // No access window needed for vector_sum_row - ARM_COMPUTE_UNUSED(vector_sum_row); - - if(bias != nullptr) - { - AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, bias_access); - } - - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) - { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); - AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); - } - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel() - : _input0(nullptr), - _input1(nullptr), - _output(nullptr), - _vector_sum_col(nullptr), - _vector_sum_row(nullptr), - _bias(nullptr), - _output_multipliers(nullptr), - _output_shifts(nullptr), - _slide_matrix_b(true), - _reinterpret_input_as_3d(false), - _reinterpret_output_as_3d(false), - _use_dummy_work_items(false), - _is_quantized_per_channel(false), - _fuse_output_stage(false) -{ - _type = CLKernelType::GEMM; -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, - const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts); -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, - const GEMMKernelInfo &gemm_info, - const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), - input1->info(), - output->info(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output_multipliers != nullptr ? output_multipliers->info() : nullptr, - output_shifts != nullptr ? output_shifts->info() : nullptr)); - - auto padding_info = get_padding_info({ input0, input1, output, vector_sum_row }); - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - const int32_t a_offset = gemm_info.a_offset; - const int32_t b_offset = gemm_info.b_offset; - - _input0 = input0; - _input1 = input1; - _output = output; - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _bias = bias; - _output_multipliers = output_multipliers; - _output_shifts = output_shifts; - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), - input1->info(), - output->info(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output_multipliers != nullptr ? output_multipliers->info() : nullptr, - output_shifts != nullptr ? output_shifts->info() : nullptr, - num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1); - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % internal_m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - - std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); - kernel_name += rhs_info.transpose ? "t" : "nt"; - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - kernel_name += "_fused_output_stage_fixedpoint"; - _fuse_output_stage = true; - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - build_opts.add_option_if(min != min_val.get(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if(max != max_val.get(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if(_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - idx++; - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - idx++; - } - - if(_fuse_output_stage) - { - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice); - } - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h deleted file mode 100644 index e79f6dfe05..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped - * - * @note The input matrix input1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported - */ -class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr, - const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr, - const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] output Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector info of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector info of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr, - const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr, - const ITensorInfo *output_shifts = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - const ICLTensor *_bias; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; - bool _use_dummy_work_items; - bool _is_quantized_per_channel; - bool _fuse_output_stage; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */ \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp deleted file mode 100644 index b97bb84e59..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - return Status{}; -} -} // namespace - -CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, - int32_t b_offset) -{ - configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset); -} - -void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, - const ICLTensor *bias, - int32_t k, int32_t a_offset, - int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - a_offset, b_offset)); // NOLINT - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _bias = bias; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - std::string kernel_name("gemmlowp_offset_contribution"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - return Status{}; -} - -void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _mm_result, slice); - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h deleted file mode 100644 index f8705595a0..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class CLGEMMLowpOffsetContributionKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpOffsetContributionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, - int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel - * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - ICLTensor *_mm_result; - const ICLTensor *_bias; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp deleted file mode 100644 index eaa832fbaa..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); - // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); - - return Status{}; -} -} // namespace - -CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel() - : _mm_result(nullptr), - _vector_sum_col(nullptr), - _vector_sum_row(nullptr), - _bias(nullptr), - _output(nullptr), - _output_multipliers(nullptr), - _output_shifts(nullptr), - _is_quantized_per_channel(false) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts); -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, - const ICLTensor *bias, ICLTensor *output, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output->info(), - a_offset, b_offset, output_stage, - output_multipliers->info(), output_shifts->info())); // NOLINT - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts }); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _bias = bias; - _output = output; - _output_multipliers = output_multipliers; - _output_shifts = output_shifts; - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - // Auto initialize the output - auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - build_opts.add_option_if((min > min_val.get()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < max_val.get()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - - std::string kernel_name("gemmlowp_offset_contribution"); - kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - return Status{}; -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _mm_result, slice); - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - add_3D_tensor_argument(idx, _output, slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 15f54d17a5..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution - * of matrix A and matrix B and performs the output stage defined by the output_stage argument - * - * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. - */ -class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpOffsetContributionOutputStageKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - */ - void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, - int32_t k, - int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel - * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_mm_result; - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - const ICLTensor *_bias; - ICLTensor *_output; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _is_quantized_per_channel; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp deleted file mode 100644 index 03bdf3f836..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); - - auto padding_info = get_padding_info({ input, bias, output }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); - - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); - - // Set the arguments to pass at compile time - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h deleted file mode 100644 index 8653102cd8..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. - */ -class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QSYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp deleted file mode 100644 index abef484c1e..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} -} // namespace - -class Coordinates; -CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); - - auto padding_info = get_padding_info({ input, bias, output }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); - - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); - - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); - build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h deleted file mode 100644 index 0a8d5e1942..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Requantize - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp deleted file mode 100644 index faf86c769e..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} -} //namespace - -CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage)); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage); -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *output_stage) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), - (bias != nullptr) ? bias->info() : nullptr, - output->info(), - output_stage)); - - auto padding_info = get_padding_info({ input, bias, output }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type)); - - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); - - // Set the arguments to pass at compile time - auto min = output_stage->gemmlowp_min_bound; - auto max = output_stage->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index abdf33ea43..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */ \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp deleted file mode 100644 index 55ae3ea6ca..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); - - if(output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); - } - return Status{}; -} - -Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); - } - return Status{}; -} -} // namespace - -ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() - : _input(), _output() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info); -} - -void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); - - _input = mtx_a; - _output = vector_sum_row; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - - std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - // This kernel does not need padding - Window win = calculate_max_window(*vector_sum_row->info(), Steps()); - ICLKernel::configure_internal(win); - - _config_id = kernel_name; - _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - - return Status{}; -} - -void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); - Window slice_in = collapsed.first_slice_window_2D(); - Window slice_out = collapsed.first_slice_window_2D(); - - // Setup input slice. Its dimensions are increased in the cl kernel. - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} - -void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info); -} - -void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); - - _input = mtx_b; - _output = vector_sum_col; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0))); - build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - - return Status{}; -} - -void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); - - Window slice_out = collapsed.first_slice_window_2D(); - Window slice_in = slice_out; - - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h deleted file mode 100644 index 237d8099b7..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; -struct GEMMLowpReductionKernelInfo; - -/** Common interface for all OpenCL reduction kernels */ -class ICLGEMMLowpReductionKernel : public ICLKernel -{ -public: - /** Constructor */ - ICLGEMMLowpReductionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete; - /** Allow instances of this class to be moved */ - ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default; - /** Allow instances of this class to be moved */ - ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default; - - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - -protected: - const ICLTensor *_input; - ICLTensor *_output; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp new file mode 100644 index 0000000000..ec0a3bf8e0 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + if(src0->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); + + const int m = gemm_info.m(); + const int n = gemm_info.n(); + const int k = gemm_info.k(); + + ARM_COMPUTE_UNUSED(m); + ARM_COMPUTE_UNUSED(n); + ARM_COMPUTE_UNUSED(k); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast(n)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast(k)); + if(gemm_info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); + } + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + } + + return Status{}; +} + +std::pair validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + bool reinterpret_dst_as_3d = (gemm_info.depth_output_gemm3d() != 0); + + Window win{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_dst_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_dst_as_3d) + { + reinterpret_dst_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + + TensorInfo tmp_info(*dst); + + if(reinterpret_dst_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + // RHS matrix still needs padding on the X + AccessWindowStatic src1_access(src1, 0, 0, + ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), + src1->dimension(1)); + + window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + + // We still need padding on the X dimension for the RHS matrix + auto padding_info = get_padding_info({ src0, dst }); + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + std::string kernel_name("gemmlowp_mm_native"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; + _config_id += "_"; + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k()); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h new file mode 100644 index 0000000000..491c3e44df --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ +class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyNativeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] src1 Source tensor containing the RHS matrix. Data type supported: same as @p src0 + * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 + * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H*/ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp new file mode 100644 index 0000000000..44fda01ded --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace misc::shape_calculator; + +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); + ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); + + const int m = gemm_info.m(); + const int n = gemm_info.n(); + const int k = gemm_info.k(); + + TensorShape tensor_shape0{ src0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + } + + return Status{}; +} + +std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + + TensorInfo tmp_info(*dst); + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + return std::make_pair(Status{}, collapsed); +} +} // namespace + +ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + + _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + _k = gemm_info.k(); + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + + // Check if we need to slide the matrix B + const unsigned int num_dimensionssrc0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0); + + auto padding_info = get_padding_info({ src0, src1, dst }); + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); + + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m())); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + std::string kernel_name("gemmlowp_mm_reshaped_"); + kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; + kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; + _config_id += "_"; + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k()); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.v0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.interleave); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg(idx++, static_cast(_k)); + _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h new file mode 100644 index 0000000000..b99dec33af --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped + * + * @note The input matrices @p src0 and @p src1 must be reshaped through: + * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel + * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel + */ +class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyReshapedKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] src1 Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. + * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 + * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * lhs_info.transpose: false + * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * rhs_info.transpose: true + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + * + * @note lhs_info.k0 must be equal to rhs_info.k0 + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_output_as_3d{ false }; + unsigned int _k{ 1 }; + bool _use_dummy_work_items{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H*/ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp new file mode 100644 index 0000000000..9d626936ff --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace misc::shape_calculator; + +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + if(src0->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + + const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; + const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; + const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); + + const int m = gemm_info.m; + const int n = gemm_info.n; + const int k = gemm_info.k; + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); + if(gemm_info.reinterpret_input_as_3d) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + if(output_stage.type == GEMMLowpOutputStageType::NONE) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + } + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); + + // Checks performed if the dst stage needs to be fused + if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + // If a_offset == 0, vector_sum_col can be a nullptr + if(gemm_info.a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(gemm_info.b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if mm result is a 3D reinterpretation + const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); + + if(expected_dst_shape.num_dimensions() > 1) + { + const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + TensorShape collapsed_dst_shape(expected_dst_shape); + collapsed_dst_shape.collapse_from(dst_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], + "vector_sum_row must have the same number of batches of dst tensor"); + + if(gemm_info.a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + + if(output_multipliers != nullptr && output_shifts != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + if(output_stage.is_quantized_per_channel) + { + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); + } + } + } + return Status{}; +} + +std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +{ + const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; + + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); + if(output_stage.type != GEMMLowpOutputStageType::NONE) + { + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + } + else + { + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32)); + } + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; + num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + if(gemm_info.a_offset != 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); + window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); + } + // No access window needed for vector_sum_row + ARM_COMPUTE_UNUSED(vector_sum_row); + + if(bias != nullptr) + { + AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); + window_changed = window_changed || update_window_and_padding(win_out, bias_access); + } + + if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + { + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); + window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); + } + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + + auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; + const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; + const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; + const int32_t a_offset = gemm_info.a_offset; + const int32_t b_offset = gemm_info.b_offset; + + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _is_quantized_per_channel = output_stage.is_quantized_per_channel; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % internal_m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); + + std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); + kernel_name += rhs_info.transpose ? "t" : "nt"; + + if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + kernel_name += "_fused_output_stage_fixedpoint"; + _fuse_output_stage = true; + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0 && vector_sum_col != nullptr) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); + build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); + build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + + const int min = output_stage.gemmlowp_min_bound; + const int max = output_stage.gemmlowp_max_bound; + + PixelValue min_val{}; + PixelValue max_val{}; + std::tie(min_val, max_val) = get_min_max(dst->data_type()); + build_opts.add_option_if(min != min_val.get(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if(max != max_val.get(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; + _config_id += "_"; + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), + gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + idx++; + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + idx++; + } + + if(_fuse_output_stage) + { + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); + } + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h new file mode 100644 index 0000000000..9e52b38249 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped + * + * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel + * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported + */ +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0 + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. + * Only the following values are supported for LHS info: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * Only the following values are supported for RHS info: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * rhs_info.transpose: true + * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 + * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 + * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. + * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32. + * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _is_quantized_per_channel{ false }; + bool _fuse_output_stage{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..e491cca914 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + return Status{}; +} +} // namespace + +ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->num_dimensions() > 1 + && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + std::string kernel_name("gemmlowp_offset_contribution"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); + IClKernel::configure_internal(win); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name + "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + return Status{}; +} + +void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); + + const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_SRC_DST)); + + Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, mm_result, slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..d1712f4f4b --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class ClGemmLowpOffsetContributionKernel : public IClKernel +{ +public: + ClGemmLowpOffsetContributionKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t k, int32_t a_offset, int32_t b_offset); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp new file mode 100644 index 0000000000..1e2d7d7efe --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, + int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + if(output_stage.is_quantized_per_channel) + { + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); + // Checks performed when output is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst); + } + + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); + + return Status{}; +} +} // namespace + +ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); + + const int min = output_stage.gemmlowp_min_bound; + const int max = output_stage.gemmlowp_max_bound; + + _is_quantized_per_channel = output_stage.is_quantized_per_channel; + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->num_dimensions() > 1 + && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Auto initialize the output + auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); + build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); + build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + + PixelValue min_val{}; + PixelValue max_val{}; + std::tie(min_val, max_val) = get_min_max(dst->data_type()); + build_opts.add_option_if((min > min_val.get()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < max_val.get()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + + std::string kernel_name("gemmlowp_offset_contribution"); + kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name + "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + return Status{}; +} + +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, mm_result, slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + add_3D_tensor_argument(idx, dst, slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h new file mode 100644 index 0000000000..977f2eac53 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution + * of matrix A and matrix B and performs the output stage defined by the output_stage argument + * + * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. + */ +class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel +{ +public: + ClGemmLowpOffsetContributionOutputStageKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info + * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _is_quantized_per_channel{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..8aec1654d9 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + + return Status{}; +} + +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + auto padding_info = get_padding_info({ src, bias, dst }); + + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); + + // Set the arguments to pass at compile time + auto min = info->gemmlowp_min_bound; + auto max = info->gemmlowp_max_bound; + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); + build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + // Create src window + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Setup bias slice + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..c935aa7ec4 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. + */ +class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. + * @param[in] info Output stage info. Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp new file mode 100644 index 0000000000..9b488ff329 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) + || info->gemmlowp_min_bound > info->gemmlowp_max_bound); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + + return Status{}; +} + +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + auto padding_info = get_padding_info({ src, bias, dst }); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); + + auto min = info->gemmlowp_min_bound; + auto max = info->gemmlowp_max_bound; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); + build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + // Create input window + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Setup bias slice + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h new file mode 100644 index 0000000000..eff8c4b2be --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Requantize + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + */ +class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleByFloatKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] info Output stage info. Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp new file mode 100644 index 0000000000..9a25973a93 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) + || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} //namespace + +ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); + + return Status{}; +} + +void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); + + auto padding_info = get_padding_info({ src, bias, dst }); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); + + // Set the arguments to pass at compile time + auto min = output_stage->gemmlowp_min_bound; + auto max = output_stage->gemmlowp_max_bound; + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); + build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h new file mode 100644 index 0000000000..c5374755c8 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * -# -to the [0..255] range and cast to QASYMM8. + * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. + */ +class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] output_stage GEMMLowp output stage metadata. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp new file mode 100644 index 0000000000..b4886805fb --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + } + return Status{}; +} + +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + } + return Status{}; +} +} // namespace + +IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); + + auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type())); + build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); + + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + + std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + // This kernel does not need padding + Window win = calculate_max_window(*vector_sum_row, Steps()); + ICLKernel::configure_internal(win); + + _config_id = kernel_name; + _config_id += "_"; + _config_id += support::cpp11::to_string(mtx_a->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mtx_a->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mtx_a->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + + return Status{}; +} + +void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); + Window slice_in = collapsed.first_slice_window_2D(); + Window slice_out = collapsed.first_slice_window_2D(); + + // Setup input slice. Its dimensions are increased in the cl kernel. + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + add_2D_tensor_argument(idx, dst, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} + +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); + + auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); + build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type())); + build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); + + // Create kernel + _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration)); + IClKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + + return Status{}; +} + +void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); + + Window slice_out = collapsed.first_slice_window_2D(); + Window slice_in = slice_out; + + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + add_2D_tensor_argument(idx, dst, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h new file mode 100644 index 0000000000..11188ed062 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Common interface for all OpenCL reduction kernels */ +class IClGemmLowpReductionKernel : public IClKernel +{ +public: + IClGemmLowpReductionKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. + */ + virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 31c8908270..bc9a3056e8 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -29,11 +29,6 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClTransposeKernel.h" #include "support/Cast.h" diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 188f3b8819..cef8ad5a0d 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -31,11 +31,6 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLCol2ImKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" #include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "src/core/helpers/AutoConfiguration.h" diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index d5d1b5f41e..bab29a5095 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -30,11 +30,6 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" #include "src/core/CL/kernels/CLWeightsReshapeKernel.h" diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 3be09581bd..6c64731f73 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -34,12 +34,12 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClCastKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" @@ -49,6 +49,7 @@ namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::cl_gemm; +using namespace arm_compute::opencl::kernels; namespace { @@ -95,7 +96,7 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(CLGEMMLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) { return false; } @@ -127,15 +128,15 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info // NOTE: This assumes: - // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). - // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). GEMMKernelInfo gemm_kernel_info; gemm_kernel_info.m = m; gemm_kernel_info.n = n; @@ -147,7 +148,7 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) { return false; } @@ -189,14 +190,14 @@ inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(std::make_unique()), - _mm_native_kernel(std::make_unique()), - _mm_reshaped_only_rhs_kernel(std::make_unique()), - _mtx_b_reshape_kernel(std::make_unique()), - _mtx_a_reduction_kernel(std::make_unique()), - _mtx_b_reduction_kernel(std::make_unique()), - _offset_contribution_kernel(std::make_unique()), - _offset_contribution_output_stage_kernel(std::make_unique()), + _weights_to_qasymm8(std::make_unique()), + _mm_native_kernel(std::make_unique()), + _mm_reshaped_only_rhs_kernel(std::make_unique()), + _mtx_b_reshape_kernel(std::make_unique()), + _mtx_a_reduction_kernel(std::make_unique()), + _mtx_b_reduction_kernel(std::make_unique()), + _offset_contribution_kernel(std::make_unique()), + _offset_contribution_output_stage_kernel(std::make_unique()), _qasymm8_weights(), _vector_sum_col(), _vector_sum_row(), @@ -206,6 +207,7 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo()->quantization_info().uniform().offset; _matrix_a = a; + _c = c; _output = output; _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type()) @@ -309,7 +312,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), _vector_sum_col.info(), reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -320,7 +323,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); + _mtx_a_reduction_kernel->configure(compile_context, a->info(), _vector_sum_row.info(), reduction_info); } GEMMKernelInfo gemm_kernel_info; @@ -356,8 +359,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), gemm_kernel_info, _a_offset == 0 ? nullptr : _vector_sum_col.info(), + _b_offset == 0 ? nullptr : _vector_sum_row.info(), c != nullptr ? c->info() : nullptr, _gemm_output_stage_multipliers.info(), _gemm_output_stage_shifts.info()); } else { @@ -367,7 +370,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), _mm_result_s32.info(), gemm_kernel_info); } else { @@ -377,11 +380,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _matrix_a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : matrix_b->info(), reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); + _mm_native_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), _mm_result_s32.info(), lhs_info, rhs_info, reshape_info); - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _offset_contribution_output_stage_kernel->configure(compile_context, _mm_result_s32.info(), _a_offset == 0 ? nullptr : _vector_sum_col.info(), _b_offset == 0 ? nullptr : _vector_sum_row.info(), + c != nullptr ? c->info() : nullptr, output->info(), a->info()->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, + _gemm_output_stage_multipliers.info(), _gemm_output_stage_shifts.info()); _mm_result_s32.allocator()->allocate(); } } @@ -402,7 +405,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), gemm_kernel_info); } else { @@ -412,12 +415,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, reshape_info); + _mm_native_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); + _offset_contribution_kernel->configure(compile_context, output->info(), _a_offset == 0 ? nullptr : _vector_sum_col.info(), _b_offset == 0 ? nullptr : _vector_sum_row.info(), + c != nullptr ? c->info() : nullptr, a->info()->dimension(0), _a_offset, _b_offset); } // Allocate tensors @@ -480,7 +483,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); + ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; if(reshape_matrix_b) @@ -496,7 +499,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } TensorInfo info_vector_sum_col{}; @@ -509,7 +512,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -518,7 +521,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -543,7 +546,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.output_stage = gemmlowp_output_stage; if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, c, @@ -560,7 +563,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { @@ -575,11 +578,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, c, @@ -595,7 +598,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso if(reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { @@ -606,13 +609,13 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } if(output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, c, @@ -629,48 +632,83 @@ void CLGEMMLowpMatrixMultiplyCore::run() MemoryGroupResourceScope scope_mg(_memory_group); + const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : _original_b; + if(_is_gemm_reshaped) { + matrix_b = &_tmp_b; if(!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_pack; - mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b); - mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b); - CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); + ITensorPack mtx_b_reshape_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_tmp_b } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); + ITensorPack mtx_b_red_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_vector_sum_col } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 if(_b_offset != 0) { - CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false); + ITensorPack mtx_a_red_pack = { { TensorType::ACL_SRC, _matrix_a }, { TensorType::ACL_DST, &_vector_sum_row } }; + CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply if(_is_gemm_reshaped) { - CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false); + ITensorPack gemm_reshaped_pack; + if(_run_offset_contribution) + { + gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_DST, _run_output_stage ? &_mm_result_s32 : _output } }); + } + else + { + gemm_reshaped_pack = ITensorPack( + { + { TensorType::ACL_SRC, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : &_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : &_vector_sum_col }, { TensorType::ACL_SHIFTS, &_gemm_output_stage_shifts }, { TensorType::ACL_MULTIPLIERS, &_gemm_output_stage_multipliers }, { TensorType::ACL_DST, _output }, + }); + } + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } else { - CLScheduler::get().enqueue(*_mm_native_kernel, false); + ITensorPack gemm_native_pack = + { + { TensorType::ACL_SRC_0, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_DST, _run_offset_contribution ? _output :&_mm_result_s32 } + }; + CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } if(_run_output_stage) { // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true); + ITensorPack output_stage_pack = + { + { TensorType::ACL_SRC, &_mm_result_s32 }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr :&_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr :&_vector_sum_col }, { TensorType::ACL_SHIFTS, &_gemm_output_stage_shifts }, { TensorType::ACL_MULTIPLIERS, &_gemm_output_stage_multipliers }, { TensorType::ACL_DST, _output }, + }; + CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } if(_run_offset_contribution) { // Run offset contribution kernel - CLScheduler::get().enqueue(*_offset_contribution_kernel, true); + ITensorPack offset_contrib_pack = + { + { TensorType::ACL_SRC_DST, _output }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr :&_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr :&_vector_sum_col } + }; + CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } @@ -691,9 +729,11 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() // Run reshape kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - ITensorPack mtx_b_pack; - mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b); - mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b); + ITensorPack mtx_b_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_tmp_b } + }; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); _original_b->mark_as_unused(); } @@ -702,7 +742,12 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() if(_a_offset != 0 && _reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); + ITensorPack mtx_b_red_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_vector_sum_col } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index be452aaf3d..e230e8f2e6 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,111 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" #include namespace arm_compute { -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_offset = result_offset_after_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8; - auto k = std::make_unique(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() + : _kernel(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr) { - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_offset = result_offset_after_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8_SIGNED; - auto k = std::make_unique(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8_SIGNED; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QSYMM16; - auto k = std::make_unique(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QSYMM16; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) { @@ -140,26 +52,30 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _input = input; + _bias = bias; + _output = output; + switch(info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - auto k = std::make_unique(); - k->configure(compile_context, input, bias, output, &info); + auto k = std::make_unique(); + k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info); _kernel = std::move(k); break; } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - auto k = std::make_unique(); - k->configure(compile_context, input, bias, output, &info); + auto k = std::make_unique(); + k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info); _kernel = std::move(k); break; } case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: { - auto k = std::make_unique(); - k->configure(compile_context, input, bias, output, &info); + auto k = std::make_unique(); + k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info); _kernel = std::move(k); break; } @@ -176,13 +92,19 @@ Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorIn switch(info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); default: return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); } } + +void CLGEMMLowpOutputStage::run() +{ + ITensorPack pack{ { ACL_SRC, _input }, { ACL_BIAS, _bias }, { ACL_DST, _output } }; + CLScheduler::get().enqueue_op(*_kernel, pack, true); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 85d13c246e..9754bdcb82 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -30,11 +30,6 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index a44dcd2e24..589523a3c3 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -28,11 +28,6 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include @@ -179,7 +174,13 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); @@ -386,7 +387,12 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index fcf5b9d2a4..5df895a91c 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -31,17 +31,14 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "src/core/helpers/WindowHelpers.h" namespace arm_compute { using namespace arm_compute::utils::info_helpers; +using namespace arm_compute::opencl::kernels; namespace { Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, @@ -93,15 +90,15 @@ void CLQLSTMLayer::TensorCopyKernel::run() } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr memory_manager) - : _input_to_input_reduction(std::make_unique()), - _recurrent_to_input_reduction(std::make_unique()), - _input_to_forget_reduction(std::make_unique()), - _recurrent_to_forget_reduction(std::make_unique()), - _input_to_cell_reduction(std::make_unique()), - _recurrent_to_cell_reduction(std::make_unique()), - _input_to_output_reduction(std::make_unique()), - _recurrent_to_output_reduction(std::make_unique()), - _projection_reduction(std::make_unique()), + : _input_to_input_reduction(std::make_unique()), + _recurrent_to_input_reduction(std::make_unique()), + _input_to_forget_reduction(std::make_unique()), + _recurrent_to_forget_reduction(std::make_unique()), + _input_to_cell_reduction(std::make_unique()), + _recurrent_to_cell_reduction(std::make_unique()), + _input_to_output_reduction(std::make_unique()), + _recurrent_to_output_reduction(std::make_unique()), + _projection_reduction(std::make_unique()), _layer_norms(), _copy_output() { @@ -247,18 +244,22 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, + true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); if(_has_projection) { - _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); if(_projection_bias != nullptr) { _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); @@ -677,19 +678,19 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); if(!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); if(lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); if(lstm_params.projection_bias() != nullptr) @@ -1128,8 +1129,12 @@ void CLQLSTMLayer::prepare() { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_input_to_input_reduction); - CLScheduler::get().enqueue(*_recurrent_to_input_reduction); + + ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); + + ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1144,17 +1149,30 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_input_to_forget_reduction); - CLScheduler::get().enqueue(*_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(*_input_to_cell_reduction); - CLScheduler::get().enqueue(*_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(*_input_to_output_reduction); - CLScheduler::get().enqueue(*_recurrent_to_output_reduction); + + ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); + + ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); + + ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); + + ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); + + ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); + + ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); if(_has_projection) { _projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_projection_reduction); + ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); if(_projection_bias != nullptr) { _projection_bias_add.run(); diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 755fa40121..20deef4edf 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -29,11 +29,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" namespace arm_compute { diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp index 1c7446f653..52adb94c83 100644 --- a/tests/validation/CL/GEMMLowp.cpp +++ b/tests/validation/CL/GEMMLowp.cpp @@ -213,140 +213,6 @@ TEST_SUITE_END() // BoundedReLu TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() // QuantizeDownInt32Scale -TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint) - -TEST_SUITE(QASYMM8) - -const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true }); -using CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture = - GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture; - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // BoundedReLu -TEST_SUITE_END() // QASYMM8 -TEST_SUITE(QASYMM8_SIGNED) -const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128, -126) * framework::dataset::make("max", 110, 112) * framework::dataset::make("addBias", { false, true }); -using CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture = - GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture; - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int8_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -TEST_SUITE_END() // BoundedReLu -TEST_SUITE_END() // QASYMM8_SIGNED -TEST_SUITE(QSYMM16) - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823, - 1073741825) - * framework::dataset::make("result_shift", -3, - -2) - * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, - 254601602) - * framework::dataset::make("result_shift", -3, - -1) - * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true }); - -using CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture = - GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture; - -TEST_SUITE(NoRelu) -TEST_SUITE(MultSmallerEq1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultSmallerEq1 -TEST_SUITE(MultGreater1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultGreater1 -TEST_SUITE_END() // NoRelu -TEST_SUITE(BoundedReLu) -TEST_SUITE(MultSmallerEq1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultSmallerEq1 -TEST_SUITE(MultGreater1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultGreater1 -TEST_SUITE_END() // BoundedReLu -TEST_SUITE_END() // QSYMM16 -TEST_SUITE_END() // QuantizeDownInt32ScaleByFixedPoint - TEST_SUITE(QuantizeDownInt32ScaleByFloat) TEST_SUITE(QASYMM8) diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp index 1057af95f2..d733a00296 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/framework/Asserts.h" @@ -41,7 +41,7 @@ namespace validation using namespace arm_compute::misc::shape_calculator; // Create function for CLGEMMMatrixMultiplyNativeKernel -using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeFunction; +using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeOperator; // Fixture for CLGEMMLowpMatrixMultiplyNative using CLGEMMLowpMatrixMultiplyNativeFixture = GEMMLowpMatrixMultiplyNativeValidationFixture; diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp index 68a7d055ad..3baa39bffc 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "tests/CL/CLAccessor.h" @@ -49,7 +49,7 @@ using CLGEMMReshapeLHSMatrix = CLSynthetizeOperator; // Create function for CLGEMMLowpMatrixMultiplyReshapedKernel -using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeFunction; +using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeOperator; // Fixture for CLGEMMLowpMatrixMultiplyReshaped using CLGEMMLowpMatrixMultiplyReshapedFixture = GEMMLowpMatrixMultiplyReshapedValidationFixture; diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp index 43b86b51e8..1283713c4d 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp @@ -25,7 +25,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" @@ -49,7 +49,7 @@ using namespace arm_compute::misc::shape_calculator; using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator; // Create function for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel -using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeFunction; +using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator; // Fixture for CLGEMMLowpMatrixMultiplyReshapedOnlyRHS using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFixture = GEMMLowpMatrixMultiplyReshapedOnlyRHSValidationFixture; @@ -157,7 +157,7 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned // Create and configure function CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm; - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info); } } // namespace diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index ab9d35de0f..5e2154592e 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -959,7 +959,7 @@ protected: GEMMFunctionType gemm; reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -988,7 +988,8 @@ protected: reshape_lhs.run(reshape_lhs_pack); ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1113,7 +1114,7 @@ protected: GEMMFunctionType gemm; reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1142,7 +1143,8 @@ protected: reshape_lhs.run(reshape_lhs_pack); ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1266,7 +1268,7 @@ protected: ReshapeRHSOperatorType reshape_rhs; GEMMFunctionType gemm; reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1291,7 +1293,8 @@ protected: // Compute GEMM ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1412,7 +1415,7 @@ protected: ReshapeRHSOperatorType reshape_rhs; GEMMFunctionType gemm; reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1437,7 +1440,8 @@ protected: // Compute GEMM ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1527,7 +1531,7 @@ protected: // Create and configure function GEMMFunctionType gemm; - gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); + gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1548,7 +1552,8 @@ protected: fill(AccessorType(rhs), 1); // Compute GEMM - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1624,7 +1629,7 @@ protected: // Create and configure function GEMMFunctionType gemm; - gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); + gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1645,7 +1650,8 @@ protected: fill(AccessorType(rhs), 1); // Compute GEMM - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } -- cgit v1.2.1