From a0ae8d2e6c57fd95c0edaf659b9df8b8c540d051 Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Mon, 12 Dec 2022 17:47:49 +0000 Subject: Optimize Transposed Convolution for CL backend (Quantized) This patch optimizes transposed convolution for QASYMM8 and QASYMM8_SIGNED types, by extending the transposed convolution kernel written for FP32/FP16. Resolves: COMPMID-5723 Change-Id: Iab8f09231938adb949c506fd915ed45b885e5c7c Signed-off-by: Gunes Bayir Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8792 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../cl/kernels/ClTransposedConvolutionKernel.cpp | 51 +++++++++++++++++++--- src/gpu/cl/operators/ClTransposedConvolution.h | 4 +- 2 files changed, 47 insertions(+), 8 deletions(-) (limited to 'src/gpu/cl') diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp index 16c6ad9a9b..714ca8e6d1 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp @@ -30,6 +30,8 @@ #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + namespace arm_compute { namespace opencl @@ -42,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC); @@ -57,7 +59,15 @@ Status validate_arguments(const 
ITensorInfo *input, const ITensorInfo *weights, if(biases != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + if(is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx), "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); @@ -127,12 +137,12 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const std::string kernel_name = "transposed_convolution_nhwc"; CLBuildOptions build_options; - const DataType input_data_type = input->data_type(); // Fp32 or Fp16 only - const auto strides = deconv_info.stride(); + const DataType input_data_type = input->data_type(); + const PaddingInfo strides = deconv_info.stride(); const unsigned int n0 = 1; const unsigned int m0 = 1; - const unsigned int k0 = adjust_vec_size(input_data_type == DataType::F32 ? 
4 : 8, input_channels); + const unsigned int k0 = adjust_vec_size(16 / input->element_size(), input_channels); const unsigned int partial_store_n0 = output_channels % n0; if(biases != nullptr) @@ -167,7 +177,36 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP"); - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input_data_type)); + + if(is_data_type_quantized(output_data_type)) + { + const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); + + PixelValue zero_value = PixelValue(0, input->data_type(), input->quantization_info()); + int zero_value_s32; + zero_value.get(zero_value_s32); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DIS_QUANTIZED"); + build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); + } + else + { + 
build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input_data_type)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); + } if(compile_context.get_ddk_version() >= 30) { diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h index bc04387df5..58ebc689ed 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.h +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -57,11 +57,11 @@ public: * * @param[in] compile_context The compile context to be used. * @param[in] input Input tensor info with dimensions [IFM, width, height, batch] - * Data types supported: F16/F32. + * Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. * @param[in] weights Weight tensor info with dimensions [IFM, width, height, OFM]. * Data type supported: Same as @p input * @param[in] biases (Optional) Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p input data type + * Data type supported: Should match @p input data type if floating point, otherwise S32. * @param[out] output Output tensor info with dimensions [OFM, width, height, batch] * The 1st dimension must be equal to the 4th dimension of the @p weights tensor. * Data types supported: Same as @p input. -- cgit v1.2.1