diff options
52 files changed, 1841 insertions, 2574 deletions
diff --git a/Android.bp b/Android.bp index 6250539d02..822afc1eca 100644 --- a/Android.bp +++ b/Android.bp @@ -96,15 +96,6 @@ cc_library_static { "src/core/CL/kernels/CLFFTScaleKernel.cpp", "src/core/CL/kernels/CLFillBorderKernel.cpp", "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp", "src/core/CL/kernels/CLGatherKernel.cpp", "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp", "src/core/CL/kernels/CLIm2ColKernel.cpp", @@ -351,6 +342,15 @@ cc_library_static { "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp", "src/core/gpu/cl/kernels/ClFillKernel.cpp", "src/core/gpu/cl/kernels/ClFloorKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp", "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp", "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp", "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp", diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index 92ece460dc..1983344075 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -39,15 +39,24 @@ enum TensorType : int32_t { ACL_UNKNOWN = -1, ACL_SRC_DST = 0, - ACL_SRC = 0, - ACL_SRC_0 = 0, - ACL_SRC_1 = 1, - ACL_SRC_2 = 2, - ACL_DST = 30, - ACL_DST_0 = 30, - ACL_DST_1 = 31, - ACL_DST_2 = 32, - ACL_BIAS = ACL_SRC_2, + + // Src + ACL_SRC = 0, + ACL_SRC_0 = 0, + ACL_SRC_1 = 1, + ACL_SRC_2 = 2, + ACL_SRC_3 = 3, + ACL_SRC_4 = 4, + ACL_SRC_5 = 5, + ACL_SRC_6 = 6, + + // Dst + ACL_DST = 30, + ACL_DST_0 = 30, + ACL_DST_1 = 31, + ACL_DST_2 = 32, + + // Aux ACL_INT = 50, ACL_INT_0 = 50, ACL_INT_1 = 51, @@ -56,7 +65,17 @@ enum TensorType : int32_t ACL_INT_4 = 54, ACL_SRC_VEC = 256, ACL_DST_VEC = 512, - ACL_INT_VEC = 1024 + ACL_INT_VEC = 1024, + + // Aliasing Types + // Conv etc + ACL_BIAS = ACL_SRC_2, + + // Gemm + ACL_VEC_ROW_SUM = ACL_SRC_3, + ACL_VEC_COL_SUM = ACL_SRC_4, + ACL_SHIFTS = ACL_SRC_5, + ACL_MULTIPLIERS = ACL_SRC_6, }; namespace experimental diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 3d2dbdb104..e62db8e644 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -34,17 +34,17 @@ class CLCompileContext; class IMemoryManager; class ICLTensor; class ITensorInfo; -class CLGEMMLowpMatrixMultiplyNativeKernel; -class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel; -class CLGEMMLowpOffsetContributionKernel; -class CLGEMMLowpOffsetContributionOutputStageKernel; -class CLGEMMLowpMatrixAReductionKernel; -class CLGEMMLowpMatrixBReductionKernel; namespace opencl { namespace kernels { class ClGemmReshapeRhsMatrixKernel; +class ClGemmLowpMatrixMultiplyNativeKernel; +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel; +class ClGemmLowpOffsetContributionKernel; +class ClGemmLowpOffsetContributionOutputStageKernel; +class ClGemmLowpMatrixAReductionKernel; +class ClGemmLowpMatrixBReductionKernel; } // namespace kernels } // namespace opencl @@ -150,14 +150,14 @@ private: MemoryGroup _memory_group; // Kernels used - std::unique_ptr<opencl::kernels::ClCastKernel> _weights_to_qasymm8; - std::unique_ptr<CLGEMMLowpMatrixMultiplyNativeKernel> _mm_native_kernel; - std::unique_ptr<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel> _mm_reshaped_only_rhs_kernel; - std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; - std::unique_ptr<CLGEMMLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; - std::unique_ptr<CLGEMMLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<CLGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + std::unique_ptr<opencl::kernels::ClCastKernel> _weights_to_qasymm8; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; + std::unique_ptr<opencl::kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; + std::unique_ptr<opencl::kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; + std::unique_ptr<opencl::kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; // Temporary tensors CLTensor _qasymm8_weights; @@ -171,7 +171,8 @@ private: // Tensor pointers const ICLTensor *_matrix_a; const ICLTensor *_original_b; - const ICLTensor *_output; + const ICLTensor *_c; + ICLTensor *_output; int32_t _a_offset; int32_t _b_offset; diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h index a60992a0f4..e85f2db8a9 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -45,247 +45,28 @@ class ICLTensor; class ITensorInfo; struct GEMMLowpOutputStageInfo; -/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL. - * - * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following OpenCL kernels: - * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Initialise the kernel's inputs, output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - * - * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; - -/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL. - * - * CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following OpenCL kernels: - * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Initialise the kernel's inputs, output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - * - * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; - -/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL. - * - * CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters: - * - * result_fixedpoint_multiplier, result_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following CL kernels: - * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(), - int max = std::numeric_limits<int32_t>::max()); - /** Initialise the kernel's inputs, output - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint - * - * @param[in] input Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor info. Data type supported: QSYMM16 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; /** Basic function to execute GEMMLowpQuantizeDown kernels on CL. * * This function calls the following CL kernels: * - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel */ -class CLGEMMLowpOutputStage : public ICLSimpleFunction +class CLGEMMLowpOutputStage : public IFunction { public: + CLGEMMLowpOutputStage(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMLowpOutputStage(const CLGEMMLowpOutputStage &) = delete; + /** Default move constructor */ + CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMLowpOutputStage &operator=(const CLGEMMLowpOutputStage &) = delete; + /** Default move assignment operator */ + CLGEMMLowpOutputStage &operator=(CLGEMMLowpOutputStage &&); + /** Default destructor */ + ~CLGEMMLowpOutputStage(); /** Initialise the kernel's inputs, output * * Valid data layouts: @@ -315,7 +96,7 @@ public: * @param[in] info GEMMLowp output stage metadata. */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel * * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. @@ -326,6 +107,15 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr<ICLKernel> _kernel; + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h index 2ef7427a5a..9c004b85d0 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h @@ -47,16 +47,16 @@ class ICLTensor; * * This function calls the following CL functions/kernels: * - * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 - * -# @ref CLTranspose Matrix transpose - * -# @ref CLConcatenateLayer Tensor concatenation - * -# @ref CLActivationLayer Activation functions (tanh and logistic) - * -# @ref CLArithmeticAddition Elementwise addition - * -# @ref CLPixelWiseMultiplication Elementwise multiplication - * -# @ref CLSlice Tensor slicing - * -# @ref CLDequantizationLayer Dequantize into float - * -# @ref CLQuantizationLayer Quantize from float + * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers + * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 + * -# @ref CLTranspose Matrix transpose + * -# @ref CLConcatenateLayer Tensor concatenation + * -# @ref CLActivationLayer Activation functions (tanh and logistic) + * -# @ref CLArithmeticAddition Elementwise addition + * -# @ref CLPixelWiseMultiplication Elementwise multiplication + * -# @ref CLSlice Tensor slicing + * -# @ref CLDequantizationLayer Dequantize into float + * -# @ref CLQuantizationLayer Quantize from float * */ class CLLSTMLayerQuantized : public IFunction { @@ -170,30 +170,30 @@ private: MemoryGroup _memory_group; // Functions used - CLGEMMLowpMatrixMultiplyCore _gemmlowp; - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage; - CLTranspose _transpose_weights; - CLConcatenateLayer _concat_input_weights; - CLConcatenateLayer _concat_recurrent_weights; - CLConcatenateLayer _concat_weights; - CLConcatenateLayer _concat_inputs; - CLConcatenateLayer _concat_bias; - CLActivationLayer _sigmoid_forget_gate; - CLActivationLayer _sigmoid_input_gate; - CLActivationLayer _sigmoid_output_gate; - CLActivationLayer _tanh_modulation_gate; - CLActivationLayer _tanh_output_state; - CLArithmeticAddition _add_cell_state_tmps; - CLArithmeticAddition _add2; - CLPixelWiseMultiplication _mul_forget_gate_cell_state; - CLPixelWiseMultiplication _mul_input_gate_input_mod_gate; - CLPixelWiseMultiplication _mul_output_state_tmp_output_gate; - CLSlice _slice_input_tensor; - CLSlice _slice_forget_tensor; - CLSlice _slice_cell_tensor; - CLSlice _slice_output_tensor; - CLDequantizationLayer _dequantize; - CLQuantizationLayer _quantize; + CLGEMMLowpMatrixMultiplyCore _gemmlowp; + CLGEMMLowpOutputStage _output_stage; + CLTranspose _transpose_weights; + CLConcatenateLayer _concat_input_weights; + CLConcatenateLayer _concat_recurrent_weights; + CLConcatenateLayer _concat_weights; + CLConcatenateLayer _concat_inputs; + CLConcatenateLayer _concat_bias; + CLActivationLayer _sigmoid_forget_gate; + CLActivationLayer _sigmoid_input_gate; + CLActivationLayer _sigmoid_output_gate; + CLActivationLayer _tanh_modulation_gate; + CLActivationLayer _tanh_output_state; + CLArithmeticAddition _add_cell_state_tmps; + CLArithmeticAddition _add2; + CLPixelWiseMultiplication _mul_forget_gate_cell_state; + CLPixelWiseMultiplication _mul_input_gate_input_mod_gate; + CLPixelWiseMultiplication _mul_output_state_tmp_output_gate; + CLSlice _slice_input_tensor; + CLSlice _slice_forget_tensor; + CLSlice _slice_cell_tensor; + CLSlice _slice_output_tensor; + CLDequantizationLayer _dequantize; + CLQuantizationLayer _quantize; // Tensor pointers const ICLTensor *_input_to_input_weights; diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h index bd00d56468..1b0b759d74 100644 --- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h @@ -40,9 +40,15 @@ namespace arm_compute // Forward declarations class CLCompileContext; class ICLTensor; -class CLGEMMLowpMatrixAReductionKernel; class CLQLSTMLayerNormalizationKernel; class ITensorInfo; +namespace opencl +{ +namespace kernels +{ +class ClGemmLowpMatrixAReductionKernel; +} // namespace kernels +} // namespace opencl /** Basic function to run @ref CLQLSTMLayer * @@ -52,8 +58,8 @@ class ITensorInfo; * -# @ref CLCopy Copy function for copying output_state_out to output * -# @ref CLArithmeticAddition Elementwise addition and subtraction * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 - * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use + * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 + * -# @ref opencl::kernels::ClGemmLowpMatrixAReductionKernel For precomputing effective biases to use * -# @ref CLPixelWiseMultiplication Elementwise multiplication * -# @ref CLTranspose Transpose function for reshaping the weights * */ @@ -297,70 +303,70 @@ private: }; // Functions used - CLTranspose _transpose_input_to_forget_weights{}; - CLTranspose _transpose_input_to_cell_weights{}; - CLTranspose _transpose_input_to_output_weights{}; - CLTranspose _transpose_input_to_input_weights{}; - CLTranspose _transpose_recurrent_to_forget_weights{}; - CLTranspose _transpose_recurrent_to_cell_weights{}; - CLTranspose _transpose_recurrent_to_output_weights{}; - CLTranspose _transpose_recurrent_to_input_weights{}; - CLTranspose _transpose_projection_weights{}; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction; - std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction; - CLArithmeticAddition _projection_bias_add{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; - CLGEMMLowpOutputStage _input_to_forget_outstage{}; - CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; - CLGEMMLowpOutputStage _cell_to_forget_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_forget{}; - CLArithmeticAddition _accumulate_cell_forget{}; - CLActivationLayer _forget_gate_sigmoid{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; - CLGEMMLowpOutputStage _input_to_cell_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; - CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_modulation{}; - CLActivationLayer _cell_gate_tanh{}; - CLArithmeticSubtraction _input_gate_sub{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; - CLGEMMLowpOutputStage _input_to_input_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; - CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_input{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; - CLGEMMLowpOutputStage _cell_to_input_outstage{}; - CLArithmeticAddition _accumulate_cell_input{}; - CLActivationLayer _input_gate_sigmoid{}; - CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; - CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; - CLArithmeticAddition _add_forget_cell{}; - CLActivationLayer _cell_clip{}; - CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; - CLGEMMLowpOutputStage _input_to_output_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; - CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; - CLArithmeticAddition _accumulate_input_recurrent_output{}; - CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; - CLGEMMLowpOutputStage _cell_to_output_outstage{}; - CLArithmeticAddition _accumulate_cell_to_output{}; - CLActivationLayer _output_gate_sigmoid{}; - CLActivationLayer _hidden_tanh{}; - CLPixelWiseMultiplication _pixelwise_mul_hidden{}; - CLGEMMLowpOutputStage _hidden_outstage{}; - CLGEMMLowpMatrixMultiplyCore _mm_projection{}; - CLGEMMLowpOutputStage _projection_outstage{}; - CLArithmeticAddition _accumulate_projection{}; - CLActivationLayer _projection_clip{}; + CLTranspose _transpose_input_to_forget_weights{}; + CLTranspose _transpose_input_to_cell_weights{}; + CLTranspose _transpose_input_to_output_weights{}; + CLTranspose _transpose_input_to_input_weights{}; + CLTranspose _transpose_recurrent_to_forget_weights{}; + CLTranspose _transpose_recurrent_to_cell_weights{}; + CLTranspose _transpose_recurrent_to_output_weights{}; + CLTranspose _transpose_recurrent_to_input_weights{}; + CLTranspose _transpose_projection_weights{}; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_input_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_forget_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_cell_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_output_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction; + std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _projection_reduction; + CLArithmeticAddition _projection_bias_add{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{}; + CLGEMMLowpOutputStage _input_to_forget_outstage{}; + CLGEMMLowpOutputStage _recurrent_to_forget_outstage{}; + CLGEMMLowpOutputStage _cell_to_forget_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_forget{}; + CLArithmeticAddition _accumulate_cell_forget{}; + CLActivationLayer _forget_gate_sigmoid{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{}; + CLGEMMLowpOutputStage _input_to_cell_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{}; + CLGEMMLowpOutputStage _recurrent_to_cell_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_modulation{}; + CLActivationLayer _cell_gate_tanh{}; + CLArithmeticSubtraction _input_gate_sub{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{}; + CLGEMMLowpOutputStage _input_to_input_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{}; + CLGEMMLowpOutputStage _recurrent_to_input_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_input{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{}; + CLGEMMLowpOutputStage _cell_to_input_outstage{}; + CLArithmeticAddition _accumulate_cell_input{}; + CLActivationLayer _input_gate_sigmoid{}; + CLPixelWiseMultiplication _pixelwise_mul_forget_cell{}; + CLPixelWiseMultiplication _pixelwise_mul_input_cell{}; + CLArithmeticAddition _add_forget_cell{}; + CLActivationLayer _cell_clip{}; + CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{}; + CLGEMMLowpOutputStage _input_to_output_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{}; + CLGEMMLowpOutputStage _recurrent_to_output_outstage{}; + CLArithmeticAddition _accumulate_input_recurrent_output{}; + CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{}; + CLGEMMLowpOutputStage _cell_to_output_outstage{}; + CLArithmeticAddition _accumulate_cell_to_output{}; + CLActivationLayer _output_gate_sigmoid{}; + CLActivationLayer _hidden_tanh{}; + CLPixelWiseMultiplication _pixelwise_mul_hidden{}; + CLGEMMLowpOutputStage _hidden_outstage{}; + CLGEMMLowpMatrixMultiplyCore _mm_projection{}; + CLGEMMLowpOutputStage _projection_outstage{}; + CLArithmeticAddition _accumulate_projection{}; + CLActivationLayer _projection_clip{}; std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms; CLCopy _copy_output; diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index eb4c280295..0c8b57ff9f 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -227,7 +227,7 @@ v20.11 Public major release - @ref CLLogSoftmaxLayer - GCSoftmaxLayer - New OpenCL kernels / functions: - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - @ref CLLogicalNot - @ref CLLogicalAnd - @ref CLLogicalOr @@ -260,13 +260,13 @@ v20.11 Public major release - @ref CLBatchNormalizationLayerKernel - CLPoolingLayerKernel - CLWinogradInputTransformKernel - - @ref CLGEMMLowpMatrixMultiplyNativeKernel - - @ref CLGEMMLowpMatrixAReductionKernel - - @ref CLGEMMLowpMatrixBReductionKernel - - @ref CLGEMMLowpOffsetContributionOutputStageKernel - - @ref CLGEMMLowpOffsetContributionKernel + - CLGEMMLowpMatrixMultiplyNativeKernel + - CLGEMMLowpMatrixAReductionKernel + - CLGEMMLowpMatrixBReductionKernel + - CLGEMMLowpOffsetContributionOutputStageKernel + - CLGEMMLowpOffsetContributionKernel - CLWinogradOutputTransformKernel - - @ref CLGEMMLowpMatrixMultiplyReshapedKernel + - CLGEMMLowpMatrixMultiplyReshapedKernel - @ref CLFuseBatchNormalizationKernel - @ref CLDepthwiseConvolutionLayerNativeKernel - CLDepthConvertLayerKernel @@ -281,11 +281,11 @@ v20.11 Public major release - CLLogits1DNormKernel - CLHeightConcatenateLayerKernel - CLGEMMMatrixMultiplyKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - CLGEMMLowpQuantizeDownInt32ScaleKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel + - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLDepthConcatenateLayerKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - Removed OpenCL kernels / functions: - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel @@ -596,9 +596,9 @@ v20.05 Public major release - @ref CLDeconvolutionLayer - @ref CLDirectDeconvolutionLayer - @ref CLGEMMDeconvolutionLayer - - @ref CLGEMMLowpMatrixMultiplyReshapedKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel + - CLGEMMLowpMatrixMultiplyReshapedKernel + - CLGEMMLowpQuantizeDownInt32ScaleKernel + - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - @ref CLReductionOperation - @ref CLReduceMean - @ref NEScale @@ -655,9 +655,9 @@ v20.02 Public major release - @ref CLDepthwiseConvolutionLayer - CLDepthwiseConvolutionLayer3x3 - @ref CLGEMMConvolutionLayer - - @ref CLGEMMLowpMatrixMultiplyCore - - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - - @ref CLGEMMLowpMatrixMultiplyNativeKernel + - CLGEMMLowpMatrixMultiplyCore + - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - CLGEMMLowpMatrixMultiplyNativeKernel - @ref NEActivationLayer - NEComparisonOperationKernel - @ref NEConvolutionLayer @@ -680,7 +680,7 @@ v20.02 Public major release - @ref NESplit - New OpenCL kernels / functions: - @ref CLFill - - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint + - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - New Arm® Neon™ kernels / functions: - @ref NEFill - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint @@ -861,7 +861,7 @@ v19.05 Public major release - @ref CLFFTDigitReverseKernel - @ref CLFFTRadixStageKernel - @ref CLFFTScaleKernel - - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel - CLHeightConcatenateLayerKernel - @ref CLDirectDeconvolutionLayer @@ -953,7 +953,7 @@ v19.02 Public major release - @ref CLRangeKernel / @ref CLRange - @ref CLUnstack - @ref CLGatherKernel / @ref CLGather - - @ref CLGEMMLowpMatrixMultiplyReshapedKernel + - CLGEMMLowpMatrixMultiplyReshapedKernel - New CPP kernels / functions: - @ref CPPDetectionOutputLayer - @ref CPPTopKV / @ref CPPTopKVKernel @@ -1247,8 +1247,8 @@ v17.12 Public major release - NEWinogradLayer / NEWinogradLayerKernel - New OpenCL kernels / functions - - @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore - - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint + - CLGEMMLowpOffsetContributionKernel / CLGEMMLowpMatrixAReductionKernel / CLGEMMLowpMatrixBReductionKernel / CLGEMMLowpMatrixMultiplyCore + - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - New graph nodes for Arm® Neon™ and OpenCL - graph::BranchLayer diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp index 3d3f7fef1e..3c8976fddd 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp @@ -33,7 +33,7 @@ #include "arm_compute/runtime/CL/CLTuner.h" #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" @@ -168,8 +168,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMReshapeLHSMatrix = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>; -using CLGEMMLowpMatrixMultiplyReshaped = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>; +using ClGemmReshapeLHSMatrix = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>; +using ClGemmLowpMatrixMultiplyReshaped = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedKernel>; class CLGEMMLowpMatrixMultiplyReshapedExample : public Example { @@ -269,20 +269,20 @@ public: // Validate argments if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) { - std::cerr << "Invalid arguments for CLGEMMReshapeLHSMatrixKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl; return false; } if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl; return false; } // Configure functions reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info); // Allocate tensors lhs.allocator()->allocate(); @@ -298,7 +298,8 @@ public: ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }); reshape_lhs.run(reshape_lsh_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -315,8 +316,8 @@ private: CLTensor rhs_reshaped{}; CLTensor dst{}; CLTuner tuner{}; - CLGEMMReshapeLHSMatrix reshape_lhs{}; - CLGEMMLowpMatrixMultiplyReshaped gemm{}; + ClGemmReshapeLHSMatrix reshape_lhs{}; + ClGemmLowpMatrixMultiplyReshaped gemm{}; }; /** Main test program for gemmlowp reshaped diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp index ca7b7a5f04..15c1b86c61 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp @@ -35,8 +35,8 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" @@ -47,6 +47,7 @@ using namespace arm_compute; using namespace utils; +using namespace arm_compute::opencl::kernels; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -146,8 +147,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>; -using CLGEMMLowpMatrixAReduction = test::CLSynthetizeFunction<CLGEMMLowpMatrixAReductionKernel>; +using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>; +using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>; class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example { @@ -289,7 +290,7 @@ public: const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32); vector_sum_row.allocator()->init(info_vector_sum_row); - mtx_a_reduction = std::make_unique<CLGEMMLowpMatrixAReduction>(); + mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>(); if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) { @@ -297,7 +298,7 @@ public: return false; } - mtx_a_reduction->configure(&lhs, &vector_sum_row, GEMMLowpReductionKernelInfo{}); + mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}); } // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 if(gemm_info.a_offset != 0) @@ -311,12 +312,14 @@ public: if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info())) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl; return false; } // Configure function - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info, gemm_info.a_offset == 0 ? nullptr : &vector_sum_col, gemm_info.b_offset == 0 ? nullptr : &vector_sum_row, &bias, &dst_multipliers, &dst_shifts); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), + bias.info(), dst_multipliers.info(), dst_shifts.info()); // Allocate tensors lhs.allocator()->allocate(); @@ -335,9 +338,12 @@ public: { if(mtx_a_reduction != nullptr) { - mtx_a_reduction->run(); + ITensorPack red_pack({ { ACL_SRC, &lhs }, { ACL_DST, &dst } }); + mtx_a_reduction->run(red_pack); } - gemm.run(); + + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_BIAS, &bias }, { ACL_VEC_COL_SUM, &vector_sum_col }, { ACL_VEC_ROW_SUM, &vector_sum_row }, { ACL_SHIFTS, &dst_shifts }, { ACL_MULTIPLIERS, &dst_multipliers }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -358,8 +364,8 @@ private: CLTensor dst_multipliers{}; CLTensor dst_shifts{}; CLTuner tuner{}; - CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm{}; - std::unique_ptr<CLGEMMLowpMatrixAReduction> mtx_a_reduction{ nullptr }; + ClGemmLowpMatrixMultiplyReshapedOnlyRhs gemm{}; + std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{ nullptr }; }; /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint diff --git a/filelist.json b/filelist.json index bde361f167..8a53829c5b 100644 --- a/filelist.json +++ b/filelist.json @@ -229,6 +229,21 @@ ] } }, + "GEMMLowp": { + "files": { + "kernel": [ + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp" + ] + } + }, "Mul": { "files": { "operator": [ @@ -420,69 +435,6 @@ ] } }, - "GEMMLowpMatrixMultiplyNative": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp" - ] - } - }, - "GEMMLowpMatrixMultiplyReshaped": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp" - ] - } - }, - "GEMMLowpMatrixMultiplyReshapedOnlyRHS": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp" - ] - } - }, - "GEMMLowpOffsetContribution": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp" - ] - } - }, - "GEMMLowpOffsetContributionOutputStage": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp" - ] - } - }, - "GEMMLowpQuantizeDownInt32ScaleByFixedPoint": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp" - ] - } - }, - "GEMMLowpQuantizeDownInt32ScaleByFloat": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp" - ] - } - }, - "GEMMLowpQuantizeDownInt32Scale": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp" - ] - } - }, - "GEMMLowpReduction": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp" - ] - } - }, "InstanceNormalization": { "files": { "kernel": [ diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h index c380a9cc68..09f8109445 100644 --- a/src/core/CL/CLKernels.h +++ b/src/core/CL/CLKernels.h @@ -42,15 +42,6 @@ #include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLGatherKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h deleted file mode 100644 index 125f0c6948..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ -class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyNativeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel - * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0 - * @param[in] output Output tensor info. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; - bool _use_dummy_work_items; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h deleted file mode 100644 index 06a73f173d..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped - * - * @note The input matrices @p input0 and @p input1 must be reshaped through: - * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel - * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - */ -class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyReshapedKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel - * - * @param[in] input0 Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] output Output tensor info. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_output_as_3d; - unsigned int _k; - bool _use_dummy_work_items; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h deleted file mode 100644 index e79f6dfe05..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped - * - * @note The input matrix input1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported - */ -class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr, - const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr, - const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] output Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector info of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector info of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr, - const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr, - const ITensorInfo *output_shifts = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - const ICLTensor *_bias; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; - bool _use_dummy_work_items; - bool _is_quantized_per_channel; - bool _fuse_output_stage; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */
\ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h deleted file mode 100644 index f8705595a0..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class CLGEMMLowpOffsetContributionKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpOffsetContributionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, - int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel - * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - ICLTensor *_mm_result; - const ICLTensor *_bias; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 15f54d17a5..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution - * of matrix A and matrix B and performs the output stage defined by the output_stage argument - * - * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. - */ -class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpOffsetContributionOutputStageKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - */ - void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, - int32_t k, - int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel - * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_mm_result; - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - const ICLTensor *_bias; - ICLTensor *_output; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _is_quantized_per_channel; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h deleted file mode 100644 index 8653102cd8..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. - */ -class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QSYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h deleted file mode 100644 index 0a8d5e1942..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Requantize - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index abdf33ea43..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
\ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h deleted file mode 100644 index 237d8099b7..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; -struct GEMMLowpReductionKernelInfo; - -/** Common interface for all OpenCL reduction kernels */ -class ICLGEMMLowpReductionKernel : public ICLKernel -{ -public: - /** Constructor */ - ICLGEMMLowpReductionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete; - /** Allow instances of this class to be moved */ - ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default; - /** Allow instances of this class to be moved */ - ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default; - - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - -protected: - const ICLTensor *_input; - ICLTensor *_output; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp index efba3d1d7a..ec0a3bf8e0 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" @@ -32,34 +32,39 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { -using namespace misc::shape_calculator; - +namespace opencl +{ +namespace kernels +{ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(input0->data_type() == DataType::QASYMM8) + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + if(src0->data_type() == DataType::QASYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); @@ -75,56 +80,56 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, ARM_COMPUTE_UNUSED(n); ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k)); + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k)); if(gemm_info.reinterpret_input_as_3d()) { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m)); + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } else { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m)); + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m)); } - if(output->total_size() != 0) + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + bool reinterpret_dst_as_3d = (gemm_info.depth_output_gemm3d() != 0); Window win{}; bool window_changed = false; - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_dst_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_dst_as_3d) { - reinterpret_output_as_3d = false; + reinterpret_dst_as_3d = false; } - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); - TensorInfo tmp_info(*output); + TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if(reinterpret_dst_as_3d) { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); + TensorShape tmp_shape(dst->tensor_shape()); tmp_shape.collapse(2U, 1U); tmp_info.set_tensor_shape(tmp_shape); } @@ -136,16 +141,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // RHS matrix still needs padding on the X - AccessWindowStatic input1_access(input1, 0, 0, - ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - input1->dimension(1)); + AccessWindowStatic src1_access(src1, 0, 0, + ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), + src1->dimension(1)); - window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop + window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u); + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -153,38 +158,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe } } // namespace -CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false) +ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() { _type = CLKernelType::GEMM; } -void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) { - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info); -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - _input0 = input0; - _input1 = input1; - _output = output; _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ input0, output }); + auto padding_info = get_padding_info({ src0, dst }); - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; @@ -192,20 +186,20 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com } // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); + // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. const unsigned int partial_store_m0 = internal_m % lhs_info.m0; const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; @@ -218,18 +212,18 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1))); + build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); std::string kernel_name("gemmlowp_mm_native"); @@ -244,13 +238,13 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _config_id += "_"; _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += support::cpp11::to_string(dst->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += support::cpp11::to_string(dst->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(gemm_info.k()); _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); + _config_id += support::cpp11::to_string(dst->dimension(2)); _config_id += "_"; _config_id += support::cpp11::to_string(lhs_info.m0); _config_id += "_"; @@ -261,14 +255,14 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, @@ -278,15 +272,19 @@ Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, return Status{}; } -void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - if(_input1->info()->num_dimensions() < 3) + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); } Window slice = window.first_slice_window_3D(); @@ -299,7 +297,7 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::Command { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -307,7 +305,7 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::Command { // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -322,14 +320,16 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::Command } unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2])); + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } while(window.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h new file mode 100644 index 0000000000..491c3e44df --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ +class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyNativeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] src1 Source tensor containing the RHS matrix. Data type supported: same as @p src0 + * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 + * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp index fd533fc310..44fda01ded 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" @@ -32,26 +32,33 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ using namespace misc::shape_calculator; namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); @@ -65,49 +72,50 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const int n = gemm_info.n(); const int k = gemm_info.k(); - TensorShape tensor_shape0{ input0->tensor_shape() }; + TensorShape tensor_shape0{ src0->tensor_shape() }; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ input1->tensor_shape() }; + TensorShape tensor_shape1{ src1->tensor_shape() }; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(output->total_size() != 0) + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); - TensorInfo tmp_info(*output); + TensorInfo tmp_info(*dst); if(reinterpret_output_as_3d) { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); + TensorShape tmp_shape(dst->tensor_shape()); tmp_shape.collapse(2U, 1U); tmp_info.set_tensor_shape(tmp_shape); } @@ -120,54 +128,42 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u); + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); return std::make_pair(Status{}, collapsed); } } // namespace -CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false) +ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() { _type = CLKernelType::GEMM; } -void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) { - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info); -} + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); -void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); _k = gemm_info.k(); _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); + const unsigned int num_dimensionssrc0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0); - auto padding_info = get_padding_info({ input0, input1, output }); + auto padding_info = get_padding_info({ src0, src1, dst }); ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); const unsigned int partial_store_m0 = internal_m % lhs_info.m0; const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; @@ -175,9 +171,9 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -188,8 +184,8 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); @@ -206,13 +202,13 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; _config_id += "_"; _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += support::cpp11::to_string(dst->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += support::cpp11::to_string(dst->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(gemm_info.k()); _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); + _config_id += support::cpp11::to_string(dst->dimension(2)); _config_id += "_"; _config_id += support::cpp11::to_string(lhs_info.m0); _config_id += "_"; @@ -231,14 +227,14 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, +Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, @@ -248,15 +244,19 @@ Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input return Status{}; } -void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - if(_input1->info()->num_dimensions() < 3) + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); } Window slice = window.first_slice_window_3D(); @@ -267,9 +267,9 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::Comma if(_reinterpret_output_as_3d) { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -284,15 +284,17 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::Comma } unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k)); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } while(window.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h new file mode 100644 index 0000000000..b99dec33af --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped + * + * @note The input matrices @p src0 and @p src1 must be reshaped through: + * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel + * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel + */ +class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyReshapedKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] src1 Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. + * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 + * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * lhs_info.transpose: false + * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * rhs_info.transpose: true + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + * + * @note lhs_info.k0 must be equal to rhs_info.k0 + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_output_as_3d{ false }; + unsigned int _k{ 1 }; + bool _use_dummy_work_items{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp index 3424b6d264..9d626936ff 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" @@ -32,39 +32,44 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" -#include <cstddef> -#include <cstdint> #include <tuple> -using namespace arm_compute::misc::shape_calculator; - namespace arm_compute { +namespace opencl +{ +namespace kernels +{ +using namespace misc::shape_calculator; + namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(input0->data_type() == DataType::QASYMM8) + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + if(src0->data_type() == DataType::QASYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; @@ -79,36 +84,36 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ input1->tensor_shape() }; + TensorShape tensor_shape1{ src1->tensor_shape() }; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k)); + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); if(gemm_info.reinterpret_input_as_3d) { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m)); + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } else { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m)); + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m)); } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info); - if(output->total_size() != 0) + const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); if(output_stage.type == GEMMLowpOutputStageType::NONE) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } } @@ -116,20 +121,20 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); - // Checks performed if the output stage needs to be fused + // Checks performed if the dst stage needs to be fused if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr if(gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr @@ -138,23 +143,23 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2])); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_output_shape.num_dimensions() > 1) + if(expected_dst_shape.num_dimensions() > 1) { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse_from(1); - TensorShape collapsed_output_shape(expected_output_shape); - collapsed_output_shape.collapse_from(output_batch_idx); + TensorShape collapsed_dst_shape(expected_dst_shape); + collapsed_dst_shape.collapse_from(dst_batch_idx); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx], - "vector_sum_row must have the same number of batches of output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], + "vector_sum_row must have the same number of batches of dst tensor"); if(gemm_info.a_offset != 0) { @@ -167,9 +172,9 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, } } - if(output->total_size() != 0) + if(dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type()); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); @@ -181,16 +186,16 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); if(output_stage.is_quantized_per_channel) { - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); } } } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -204,31 +209,31 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe Window win_out{}; bool window_changed = false; - // In case both input and output have to be reinterpreted as 3D tensors, + // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. if(reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } - // Output tensor auto initialization if not yet initialized - const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info); + // dst tensor auto initialization if not yet initialized + const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); if(output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32)); + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32)); } - TensorInfo tmp_info(*output); + TensorInfo tmp_info(*dst); if(reinterpret_output_as_3d) { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); + TensorShape tmp_shape(dst->tensor_shape()); tmp_shape.collapse(2U, 1U); tmp_info.set_tensor_shape(tmp_shape); } @@ -238,7 +243,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { @@ -267,7 +272,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u); + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -275,69 +280,32 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe } } // namespace -CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel() - : _input0(nullptr), - _input1(nullptr), - _output(nullptr), - _vector_sum_col(nullptr), - _vector_sum_row(nullptr), - _bias(nullptr), - _output_multipliers(nullptr), - _output_shifts(nullptr), - _slide_matrix_b(true), - _reinterpret_input_as_3d(false), - _reinterpret_output_as_3d(false), - _use_dummy_work_items(false), - _is_quantized_per_channel(false), - _fuse_output_stage(false) +ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel() { _type = CLKernelType::GEMM; } -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, - const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts); -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) + ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), - input1->info(), - output->info(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output_multipliers != nullptr ? output_multipliers->info() : nullptr, - output_shifts != nullptr ? output_shifts->info() : nullptr)); - - auto padding_info = get_padding_info({ input0, input1, output, vector_sum_row }); + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + + auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; const int32_t a_offset = gemm_info.a_offset; const int32_t b_offset = gemm_info.b_offset; - _input0 = input0; - _input1 = input1; - _output = output; - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _bias = bias; - _output_multipliers = output_multipliers; - _output_shifts = output_shifts; _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _is_quantized_per_channel = output_stage.is_quantized_per_channel; - // In case both input and output have to be reinterpreted as 3D tensors, + // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) { @@ -346,29 +314,20 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon } // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), - input1->info(), - output->info(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output_multipliers != nullptr ? output_multipliers->info() : nullptr, - output_shifts != nullptr ? output_shifts->info() : nullptr, - num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1); + // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. // NOTE: This might have implications on heuristics and performance @@ -382,9 +341,9 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); @@ -396,8 +355,8 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; @@ -410,11 +369,11 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon if(a_offset != 0 && vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); } // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0))); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); @@ -426,7 +385,7 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon PixelValue min_val{}; PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); + std::tie(min_val, max_val) = get_min_max(dst->data_type()); build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); } @@ -441,13 +400,13 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon _config_id += "_"; _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += support::cpp11::to_string(dst->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += support::cpp11::to_string(dst->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(gemm_info.k); _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); + _config_id += support::cpp11::to_string(dst->dimension(2)); _config_id += "_"; _config_id += support::cpp11::to_string(lhs_info.m0); _config_id += "_"; @@ -461,15 +420,15 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), gemm_info, vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, @@ -482,15 +441,24 @@ Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo return Status{}; } -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - if(_input1->info()->num_dimensions() < 3) + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); } Window slice = window.first_slice_window_3D(); @@ -503,15 +471,15 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } if(_reinterpret_output_as_3d) { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -541,12 +509,12 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl } unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2])); + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); if(_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor @@ -555,20 +523,22 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl if(_reinterpret_output_as_3d) { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor idx++; } if(_fuse_output_stage) { - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); } enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } while(window.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h new file mode 100644 index 0000000000..9e52b38249 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped + * + * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel + * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported + */ +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0 + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. + * Only the following values are supported for LHS info: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * Only the following values are supported for RHS info: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * rhs_info.transpose: true + * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 + * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 + * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. + * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32. + * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _is_quantized_per_channel{ false }; + bool _fuse_output_stage{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
\ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp index b97bb84e59..e491cca914 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -21,18 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ namespace { Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, @@ -93,61 +100,44 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } } // namespace -CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr) +ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() { _type = CLKernelType::ELEMENTWISE; } -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, - int32_t b_offset) -{ - configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset); -} - -void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, - const ICLTensor *bias, - int32_t k, int32_t a_offset, - int32_t b_offset) +void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t k, int32_t a_offset, int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - a_offset, b_offset)); // NOLINT + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _bias = bias; - // Check if input is a 3D reinterpretation const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + && mm_result->num_dimensions() > 1 + && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); } // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); std::string kernel_name("gemmlowp_offset_contribution"); @@ -156,33 +146,38 @@ void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compi _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); + IClKernel::configure_internal(win); // Set config_id for enabling LWS tuning _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); + _config_id += support::cpp11::to_string(mm_result->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); + _config_id += support::cpp11::to_string(mm_result->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); + _config_id += support::cpp11::to_string(mm_result->dimension(2)); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); return Status{}; } -void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); + + const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); // Set window for vector_sum_col @@ -203,13 +198,15 @@ void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQu do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _mm_result, slice); - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); + add_3D_tensor_argument(idx, mm_result, slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..d1712f4f4b --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class ClGemmLowpOffsetContributionKernel : public IClKernel +{ +public: + ClGemmLowpOffsetContributionKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t k, int32_t a_offset, int32_t b_offset); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp index eaa832fbaa..1e2d7d7efe 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -29,15 +29,22 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); @@ -103,11 +110,11 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if((dst != nullptr) && (dst->total_size() != 0)) { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); @@ -117,91 +124,63 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } } // namespace -CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel() - : _mm_result(nullptr), - _vector_sum_col(nullptr), - _vector_sum_row(nullptr), - _bias(nullptr), - _output(nullptr), - _output_multipliers(nullptr), - _output_shifts(nullptr), - _is_quantized_per_channel(false) +ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel() { _type = CLKernelType::ELEMENTWISE; } -void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, +void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts); -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, - const ICLTensor *bias, ICLTensor *output, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output->info(), - a_offset, b_offset, output_stage, - output_multipliers->info(), output_shifts->info())); // NOLINT + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts }); + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); const int min = output_stage.gemmlowp_min_bound; const int max = output_stage.gemmlowp_max_bound; - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _bias = bias; - _output = output; - _output_multipliers = output_multipliers; - _output_shifts = output_shifts; _is_quantized_per_channel = output_stage.is_quantized_per_channel; // Check if input is a 3D reinterpretation const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + && mm_result->num_dimensions() > 1 + && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Auto initialize the output - auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type)); + auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); } // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); PixelValue min_val{}; PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); + std::tie(min_val, max_val) = get_min_max(dst->data_type()); build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); @@ -212,33 +191,41 @@ void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); // Set config_id for enabling LWS tuning _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); + _config_id += support::cpp11::to_string(mm_result->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); + _config_id += support::cpp11::to_string(mm_result->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); + _config_id += support::cpp11::to_string(mm_result->dimension(2)); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); return Status{}; } -void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -260,15 +247,17 @@ void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _mm_result, slice); - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - add_3D_tensor_argument(idx, _output, slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice); + add_3D_tensor_argument(idx, mm_result, slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + add_3D_tensor_argument(idx, dst, slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h new file mode 100644 index 0000000000..977f2eac53 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution + * of matrix A and matrix B and performs the output stage defined by the output_stage argument + * + * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. + */ +class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel +{ +public: + ClGemmLowpOffsetContributionOutputStageKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info + * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _is_quantized_per_channel{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp index 03bdf3f836..8aec1654d9 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -30,81 +30,82 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); // Check biases if exist if(bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(output->total_size() != 0) + if(dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } return Status{}; } } // namespace -CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) +ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel() { _type = CLKernelType::ELEMENTWISE; } -Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); return Status{}; } -void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({ src, bias, dst }); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); // Set the arguments to pass at compile time auto min = info->gemmlowp_min_bound; auto max = info->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), @@ -116,38 +117,44 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - // Create input window + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + // Create src window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) + if(bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); + add_1D_tensor_argument(idx1, bias, biases_slice); } do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..c935aa7ec4 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. + */ +class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. + * @param[in] info Output stage info. Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp index abef484c1e..9b488ff329 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -30,18 +30,24 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) @@ -50,60 +56,48 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con // Check biases if exist if(bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(output->total_size() != 0) + if(dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } return Status{}; } } // namespace -class Coordinates; -CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) +ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel() { _type = CLKernelType::ELEMENTWISE; } -Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); return Status{}; } -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({ src, bias, dst }); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); - - _input = input; - _bias = bias; - _output = output; + auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); auto min = info->gemmlowp_min_bound; auto max = info->gemmlowp_max_bound; @@ -111,10 +105,10 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); @@ -123,38 +117,44 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + // Create input window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) + if(bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); + add_1D_tensor_argument(idx1, bias, biases_slice); } do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h new file mode 100644 index 0000000000..eff8c4b2be --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Requantize + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + */ +class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleByFloatKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] info Output stage info. Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp index faf86c769e..9a25973a93 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -21,24 +21,31 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) @@ -47,68 +54,54 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con // Check biases if exist if(bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(output->total_size() != 0) + if(dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } return Status{}; } } //namespace -CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) +ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() { _type = CLKernelType::ELEMENTWISE; } -Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) +Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); return Status{}; } -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage); -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, +void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), - (bias != nullptr) ? bias->info() : nullptr, - output->info(), - output_stage)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({ src, bias, dst }); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type)); - - _input = input; - _bias = bias; - _output = output; + auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); // Set the arguments to pass at compile time auto min = output_stage->gemmlowp_min_bound; auto max = output_stage->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); @@ -116,43 +109,49 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c "-DMIN_BOUND=" + support::cpp11::to_string(min)); build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); // Create kernel _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) + if(bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); + add_1D_tensor_argument(idx1, bias, biases_slice); } do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } -}
\ No newline at end of file +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h new file mode 100644 index 0000000000..c5374755c8 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * -# -to the [0..255] range and cast to QASYMM8. + * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. + */ +class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] output_stage GEMMLowp output stage metadata. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
\ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp index 55ae3ea6ca..b4886805fb 100644 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -21,76 +21,74 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { +namespace opencl +{ +namespace kernels +{ namespace { -Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); - if(output->total_size() > 0) + if(dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } -Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(output->total_size() > 0) + if(dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } } // namespace -ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() - : _input(), _output() +IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() { _type = CLKernelType::ELEMENTWISE; } -void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info); -} - -void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32); + auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); - _input = mtx_a; - _output = vector_sum_row; - // Set the arguments to pass at compile time CLBuildOptions build_opts; - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type())); + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type())); build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); @@ -102,21 +100,21 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile // Configure kernel window // This kernel does not need padding - Window win = calculate_max_window(*vector_sum_row->info(), Steps()); + Window win = calculate_max_window(*vector_sum_row, Steps()); ICLKernel::configure_internal(win); _config_id = kernel_name; _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(0)); + _config_id += support::cpp11::to_string(mtx_a->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(1)); + _config_id += support::cpp11::to_string(mtx_a->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(2)); + _config_id += support::cpp11::to_string(mtx_a->dimension(2)); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); @@ -124,11 +122,14 @@ Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons return Status{}; } -void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); Window slice_in = collapsed.first_slice_window_2D(); Window slice_out = collapsed.first_slice_window_2D(); @@ -141,54 +142,46 @@ void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueu do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); + add_3D_tensor_argument(idx, src, slice_in); + add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); } while(collapsed.slide_window_slice_2D(slice_out)); } -void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info); -} - -void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); - - _input = mtx_b; - _output = vector_sum_col; + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); // Output auto initialization if not yet initialized - auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32); + auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0))); - build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type())); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); + build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type())); build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); // Create kernel _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); + Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration)); + IClKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -196,11 +189,14 @@ Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons return Status{}; } -void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); Window slice_out = collapsed.first_slice_window_2D(); @@ -212,10 +208,12 @@ void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueu do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); + add_3D_tensor_argument(idx, src, slice_in); + add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); } while(collapsed.slide_window_slice_2D(slice_out)); } +} // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h new file mode 100644 index 0000000000..11188ed062 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Common interface for all OpenCL reduction kernels */ +class IClGemmLowpReductionKernel : public IClKernel +{ +public: + IClGemmLowpReductionKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. + */ + virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 31c8908270..bc9a3056e8 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -29,11 +29,6 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClTransposeKernel.h" #include "support/Cast.h" diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 188f3b8819..cef8ad5a0d 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -31,11 +31,6 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLCol2ImKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" #include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "src/core/helpers/AutoConfiguration.h" diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index d5d1b5f41e..bab29a5095 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -30,11 +30,6 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" #include "src/core/CL/kernels/CLWeightsReshapeKernel.h" diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 3be09581bd..6c64731f73 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -34,12 +34,12 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClCastKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" @@ -49,6 +49,7 @@ namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::cl_gemm; +using namespace arm_compute::opencl::kernels; namespace { @@ -95,7 +96,7 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(CLGEMMLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) { return false; } @@ -127,15 +128,15 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info // NOTE: This assumes: - // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). - // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). GEMMKernelInfo gemm_kernel_info; gemm_kernel_info.m = m; gemm_kernel_info.n = n; @@ -147,7 +148,7 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) { return false; } @@ -189,14 +190,14 @@ inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(std::make_unique<opencl::kernels::ClCastKernel>()), - _mm_native_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()), - _mm_reshaped_only_rhs_kernel(std::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()), - _mtx_b_reshape_kernel(std::make_unique<opencl::kernels::ClGemmReshapeRhsMatrixKernel>()), - _mtx_a_reduction_kernel(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _mtx_b_reduction_kernel(std::make_unique<CLGEMMLowpMatrixBReductionKernel>()), - _offset_contribution_kernel(std::make_unique<CLGEMMLowpOffsetContributionKernel>()), - _offset_contribution_output_stage_kernel(std::make_unique<CLGEMMLowpOffsetContributionOutputStageKernel>()), + _weights_to_qasymm8(std::make_unique<ClCastKernel>()), + _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()), + _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()), + _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), + _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()), + _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()), + _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()), _qasymm8_weights(), _vector_sum_col(), _vector_sum_row(), @@ -206,6 +207,7 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo _gemm_output_stage_shifts(), _matrix_a(nullptr), _original_b(nullptr), + _c(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), @@ -235,6 +237,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->info()->quantization_info().uniform().offset; _matrix_a = a; + _c = c; _output = output; _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type()) @@ -309,7 +312,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), _vector_sum_col.info(), reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -320,7 +323,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); + _mtx_a_reduction_kernel->configure(compile_context, a->info(), _vector_sum_row.info(), reduction_info); } GEMMKernelInfo gemm_kernel_info; @@ -356,8 +359,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), gemm_kernel_info, _a_offset == 0 ? nullptr : _vector_sum_col.info(), + _b_offset == 0 ? nullptr : _vector_sum_row.info(), c != nullptr ? c->info() : nullptr, _gemm_output_stage_multipliers.info(), _gemm_output_stage_shifts.info()); } else { @@ -367,7 +370,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), _mm_result_s32.info(), gemm_kernel_info); } else { @@ -377,11 +380,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _matrix_a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : matrix_b->info(), reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); + _mm_native_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), _mm_result_s32.info(), lhs_info, rhs_info, reshape_info); - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _offset_contribution_output_stage_kernel->configure(compile_context, _mm_result_s32.info(), _a_offset == 0 ? nullptr : _vector_sum_col.info(), _b_offset == 0 ? nullptr : _vector_sum_row.info(), + c != nullptr ? c->info() : nullptr, output->info(), a->info()->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, + _gemm_output_stage_multipliers.info(), _gemm_output_stage_shifts.info()); _mm_result_s32.allocator()->allocate(); } } @@ -402,7 +405,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), gemm_kernel_info); } else { @@ -412,12 +415,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con a->info(), _convert_to_qasymm8 ? _qasymm8_weights.info() : b->info(), reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, reshape_info); + _mm_native_kernel->configure(compile_context, _matrix_a->info(), matrix_b->info(), output->info(), lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); + _offset_contribution_kernel->configure(compile_context, output->info(), _a_offset == 0 ? nullptr : _vector_sum_col.info(), _b_offset == 0 ? nullptr : _vector_sum_row.info(), + c != nullptr ? c->info() : nullptr, a->info()->dimension(0), _a_offset, _b_offset); } // Allocate tensors @@ -480,7 +483,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); + ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; if(reshape_matrix_b) @@ -496,7 +499,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } TensorInfo info_vector_sum_col{}; @@ -509,7 +512,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -518,7 +521,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -543,7 +546,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.output_stage = gemmlowp_output_stage; if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, c, @@ -560,7 +563,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { @@ -575,11 +578,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, c, @@ -595,7 +598,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso if(reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { @@ -606,13 +609,13 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } if(output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, c, @@ -629,48 +632,83 @@ void CLGEMMLowpMatrixMultiplyCore::run() MemoryGroupResourceScope scope_mg(_memory_group); + const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : _original_b; + if(_is_gemm_reshaped) { + matrix_b = &_tmp_b; if(!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_pack; - mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b); - mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b); - CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); + ITensorPack mtx_b_reshape_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_tmp_b } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); + ITensorPack mtx_b_red_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_vector_sum_col } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 if(_b_offset != 0) { - CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false); + ITensorPack mtx_a_red_pack = { { TensorType::ACL_SRC, _matrix_a }, { TensorType::ACL_DST, &_vector_sum_row } }; + CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply if(_is_gemm_reshaped) { - CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false); + ITensorPack gemm_reshaped_pack; + if(_run_offset_contribution) + { + gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_DST, _run_output_stage ? &_mm_result_s32 : _output } }); + } + else + { + gemm_reshaped_pack = ITensorPack( + { + { TensorType::ACL_SRC, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : &_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : &_vector_sum_col }, { TensorType::ACL_SHIFTS, &_gemm_output_stage_shifts }, { TensorType::ACL_MULTIPLIERS, &_gemm_output_stage_multipliers }, { TensorType::ACL_DST, _output }, + }); + } + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } else { - CLScheduler::get().enqueue(*_mm_native_kernel, false); + ITensorPack gemm_native_pack = + { + { TensorType::ACL_SRC_0, _matrix_a }, { TensorType::ACL_SRC_1, matrix_b }, { TensorType::ACL_DST, _run_offset_contribution ? _output :&_mm_result_s32 } + }; + CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } if(_run_output_stage) { // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true); + ITensorPack output_stage_pack = + { + { TensorType::ACL_SRC, &_mm_result_s32 }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr :&_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr :&_vector_sum_col }, { TensorType::ACL_SHIFTS, &_gemm_output_stage_shifts }, { TensorType::ACL_MULTIPLIERS, &_gemm_output_stage_multipliers }, { TensorType::ACL_DST, _output }, + }; + CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } if(_run_offset_contribution) { // Run offset contribution kernel - CLScheduler::get().enqueue(*_offset_contribution_kernel, true); + ITensorPack offset_contrib_pack = + { + { TensorType::ACL_SRC_DST, _output }, { TensorType::ACL_BIAS, _c }, { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr :&_vector_sum_row }, { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr :&_vector_sum_col } + }; + CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } @@ -691,9 +729,11 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() // Run reshape kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - ITensorPack mtx_b_pack; - mtx_b_pack.add_const_tensor(TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b); - mtx_b_pack.add_tensor(TensorType::ACL_DST, &_tmp_b); + ITensorPack mtx_b_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_tmp_b } + }; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); _original_b->mark_as_unused(); } @@ -702,7 +742,12 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() if(_a_offset != 0 && _reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); + ITensorPack mtx_b_red_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? &_qasymm8_weights : _original_b }, + { TensorType::ACL_DST, &_vector_sum_col } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index be452aaf3d..e230e8f2e6 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,111 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" #include <algorithm> namespace arm_compute { -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_offset = result_offset_after_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8; - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() + : _kernel(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr) { - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_offset = result_offset_after_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8_SIGNED; - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QASYMM8_SIGNED; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_multiplier = result_fixedpoint_multiplier; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QSYMM16; - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - GEMMLowpOutputStageInfo info{}; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - info.output_data_type = DataType::QSYMM16; - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) { @@ -140,26 +52,30 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _input = input; + _bias = bias; + _output = output; + switch(info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, &info); + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info); _kernel = std::move(k); break; } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(compile_context, input, bias, output, &info); + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>(); + k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info); _kernel = std::move(k); break; } case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: { - auto k = std::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(compile_context, input, bias, output, &info); + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>(); + k->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), &info); _kernel = std::move(k); break; } @@ -176,13 +92,19 @@ Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorIn switch(info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); default: return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); } } + +void CLGEMMLowpOutputStage::run() +{ + ITensorPack pack{ { ACL_SRC, _input }, { ACL_BIAS, _bias }, { ACL_DST, _output } }; + CLScheduler::get().enqueue_op(*_kernel, pack, true); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 85d13c246e..9754bdcb82 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -30,11 +30,6 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/gpu/cl/kernels/ClTransposeKernel.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index a44dcd2e24..589523a3c3 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -28,11 +28,6 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include <memory> @@ -179,7 +174,13 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); @@ -386,7 +387,12 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index fcf5b9d2a4..5df895a91c 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -31,17 +31,14 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "src/core/helpers/WindowHelpers.h" namespace arm_compute { using namespace arm_compute::utils::info_helpers; +using namespace arm_compute::opencl::kernels; namespace { Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, @@ -93,15 +90,15 @@ void CLQLSTMLayer::TensorCopyKernel::run() } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _input_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _projection_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), _layer_norms(), _copy_output() { @@ -247,18 +244,22 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, + true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); if(_has_projection) { - _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); if(_projection_bias != nullptr) { _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); @@ -677,19 +678,19 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); if(!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); if(lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); if(lstm_params.projection_bias() != nullptr) @@ -1128,8 +1129,12 @@ void CLQLSTMLayer::prepare() { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_input_to_input_reduction); - CLScheduler::get().enqueue(*_recurrent_to_input_reduction); + + ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); + + ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1144,17 +1149,30 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_input_to_forget_reduction); - CLScheduler::get().enqueue(*_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(*_input_to_cell_reduction); - CLScheduler::get().enqueue(*_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(*_input_to_output_reduction); - CLScheduler::get().enqueue(*_recurrent_to_output_reduction); + + ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); + + ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); + + ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); + + ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); + + ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); + + ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); if(_has_projection) { _projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_projection_reduction); + ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); if(_projection_bias != nullptr) { _projection_bias_add.run(); diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 755fa40121..20deef4edf 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -29,11 +29,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" namespace arm_compute { diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp index 1c7446f653..52adb94c83 100644 --- a/tests/validation/CL/GEMMLowp.cpp +++ b/tests/validation/CL/GEMMLowp.cpp @@ -213,140 +213,6 @@ TEST_SUITE_END() // BoundedReLu TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() // QuantizeDownInt32Scale -TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint) - -TEST_SUITE(QASYMM8) - -const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true }); -using CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture = - GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>; - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // BoundedReLu -TEST_SUITE_END() // QASYMM8 -TEST_SUITE(QASYMM8_SIGNED) -const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2) - * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128, -126) * framework::dataset::make("max", 110, 112) * framework::dataset::make("addBias", { false, true }); -using CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture = - GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>; - -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int8_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} - -TEST_SUITE_END() // BoundedReLu -TEST_SUITE_END() // QASYMM8_SIGNED -TEST_SUITE(QSYMM16) - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, - 2) - * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823, - 1073741825) - * framework::dataset::make("result_shift", -3, - -2) - * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true }); - -const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, - 254601602) - * framework::dataset::make("result_shift", -3, - -1) - * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true }); - -using CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture = - GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>; - -TEST_SUITE(NoRelu) -TEST_SUITE(MultSmallerEq1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultSmallerEq1 -TEST_SUITE(MultGreater1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultGreater1 -TEST_SUITE_END() // NoRelu -TEST_SUITE(BoundedReLu) -TEST_SUITE(MultSmallerEq1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultSmallerEq1 -TEST_SUITE(MultGreater1) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases)) -{ - // Validate output - validate(CLAccessor(_target), _reference); -} -TEST_SUITE_END() // MultGreater1 -TEST_SUITE_END() // BoundedReLu -TEST_SUITE_END() // QSYMM16 -TEST_SUITE_END() // QuantizeDownInt32ScaleByFixedPoint - TEST_SUITE(QuantizeDownInt32ScaleByFloat) TEST_SUITE(QASYMM8) diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp index 1057af95f2..d733a00296 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/framework/Asserts.h" @@ -41,7 +41,7 @@ namespace validation using namespace arm_compute::misc::shape_calculator; // Create function for CLGEMMMatrixMultiplyNativeKernel -using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyNativeKernel>; +using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel>; // Fixture for CLGEMMLowpMatrixMultiplyNative using CLGEMMLowpMatrixMultiplyNativeFixture = GEMMLowpMatrixMultiplyNativeValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyNative>; diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp index 68a7d055ad..3baa39bffc 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "tests/CL/CLAccessor.h" @@ -49,7 +49,7 @@ using CLGEMMReshapeLHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmResha using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>; // Create function for CLGEMMLowpMatrixMultiplyReshapedKernel -using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>; +using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedKernel>; // Fixture for CLGEMMLowpMatrixMultiplyReshaped using CLGEMMLowpMatrixMultiplyReshapedFixture = GEMMLowpMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshaped>; diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp index 43b86b51e8..1283713c4d 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp @@ -25,7 +25,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" #include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" @@ -49,7 +49,7 @@ using namespace arm_compute::misc::shape_calculator; using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>; // Create function for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel -using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>; +using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>; // Fixture for CLGEMMLowpMatrixMultiplyReshapedOnlyRHS using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFixture = GEMMLowpMatrixMultiplyReshapedOnlyRHSValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS>; @@ -157,7 +157,7 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned // Create and configure function CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm; - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info); } } // namespace diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index ab9d35de0f..5e2154592e 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -959,7 +959,7 @@ protected: GEMMFunctionType gemm; reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -988,7 +988,8 @@ protected: reshape_lhs.run(reshape_lhs_pack); ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1113,7 +1114,7 @@ protected: GEMMFunctionType gemm; reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1142,7 +1143,8 @@ protected: reshape_lhs.run(reshape_lhs_pack); ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1266,7 +1268,7 @@ protected: ReshapeRHSOperatorType reshape_rhs; GEMMFunctionType gemm; reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1291,7 +1293,8 @@ protected: // Compute GEMM ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1412,7 +1415,7 @@ protected: ReshapeRHSOperatorType reshape_rhs; GEMMFunctionType gemm; reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1437,7 +1440,8 @@ protected: // Compute GEMM ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; reshape_rhs.run(reshape_rhs_pack); - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1527,7 +1531,7 @@ protected: // Create and configure function GEMMFunctionType gemm; - gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); + gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1548,7 +1552,8 @@ protected: fill(AccessorType(rhs), 1); // Compute GEMM - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } @@ -1624,7 +1629,7 @@ protected: // Create and configure function GEMMFunctionType gemm; - gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); + gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h)); ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); @@ -1645,7 +1650,8 @@ protected: fill(AccessorType(rhs), 1); // Compute GEMM - gemm.run(); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } }); + gemm.run(gemm_pack); return dst; } |