From df4cf57c7394265b27d051cb1cf0152c53659126 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 9 Oct 2019 15:32:39 +0100 Subject: COMPMID-2306: CLDepthwiseConvolution: support for QUANT8_PER_CHANNEL_SYMM Change-Id: I18c886400daa2dcba0b91011bc4e503d807a4732 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2143 Comments-Addressed: Arm Jenkins Reviewed-by: Giorgio Arena Tested-by: Arm Jenkins --- src/core/CL/CLHelpers.cpp | 48 + src/core/CL/CLKernelLibrary.cpp | 10 +- .../cl_kernels/depthwise_convolution_quantized.cl | 1201 ++++++++++++-------- src/core/CL/cl_kernels/helpers_asymm.h | 22 +- .../CL/kernels/CLChannelShuffleLayerKernel.cpp | 16 +- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 134 ++- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 133 ++- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 131 ++- ...pthwiseConvolutionLayerReshapeWeightsKernel.cpp | 3 +- .../CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp | 22 +- .../CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp | 22 +- .../CL/kernels/CLHeightConcatenateLayerKernel.cpp | 30 +- src/core/CL/kernels/CLPermuteKernel.cpp | 5 - src/core/CL/kernels/CLReverseKernel.cpp | 15 +- src/core/Utils.cpp | 3 + src/core/utils/quantization/AsymmHelpers.cpp | 22 + 16 files changed, 1079 insertions(+), 738 deletions(-) (limited to 'src/core') diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index a3c73677c7..1132aa4540 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -70,6 +70,54 @@ std::string get_cl_type_from_data_type(const DataType &dt) } } +std::string get_cl_promoted_type_from_data_type(const DataType &dt) +{ + switch(dt) + { + case DataType::U8: + case DataType::QASYMM8: + case DataType::QASYMM8_PER_CHANNEL: + return "ushort"; + case DataType::S8: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + return "short"; + case DataType::U16: + case DataType::QASYMM16: + return "uint"; + case DataType::S16: + case 
DataType::QSYMM16: + return "int"; + case DataType::U32: + return "ulong"; + case DataType::S32: + return "long"; + case DataType::F16: + return "float"; + default: + ARM_COMPUTE_ERROR("Cannot get promoted OpenCL type for the input data type."); + return ""; + } +} + +std::string get_cl_unsigned_type_from_element_size(size_t element_size) +{ + switch(element_size) + { + case 1: + return "uchar"; + case 2: + return "ushort"; + case 4: + return "uint"; + case 8: + return "ulong"; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + return ""; + } +} + std::string get_cl_select_type_from_data_type(const DataType &dt) { switch(dt) diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index b2905a848b..5d5205439e 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -223,11 +223,11 @@ const std::map CLKernelLibrary::_kernel_program_map = { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" }, { "dwc_MxN_native_fp_nhwc", "depthwise_convolution.cl" }, { "dwc_MxN_native_quantized8_nhwc", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_native_quantized8_nchw", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_native_quantized8_dot8_nchw", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_quantized8_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_quantized8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, { "depth_to_space_nchw", "depth_to_space.cl" }, { 
"depth_to_space_nhwc", "depth_to_space.cl" }, { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" }, diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index 94373b74e7..dbcfae610f 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -33,7 +33,6 @@ #endif /* VEC_SIZE */ #if defined(ACTIVATION_TYPE) && defined(CONST_0) -#define DATA_TYPE uchar #include "activation_layer_quant.cl" #define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x) #else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */ @@ -42,11 +41,16 @@ #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE) #define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE) -#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) +#if defined(DATA_TYPE) && defined(WEIGHTS_TYPE) + +#define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size) + +#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) + +#if defined(WEIGHTS_PROMOTED_TYPE) +#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size) #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) @@ -62,77 +66,77 @@ #error "Stride X not supported" #endif /* CONV_STRIDE_X > 3 */ -#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)) +#if !defined(IS_DOT8) #if DILATION_X == 1 #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, 
middle, right) \ - ({ \ - int8 temp0 = CONVERT(vload8(0, first_value), int8); \ - int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \ + int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \ \ - left = CONVERT(temp0.s01234567, int8); \ - middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \ - right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \ + left = CONVERT(temp0.s01234567, int8); \ + middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \ + right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \ \ - left = CONVERT(temp0.s02468ace, int8); \ - middle = CONVERT(temp0.s13579bdf, int8); \ - right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \ + left = CONVERT(temp0.s02468ace, int8); \ + middle = CONVERT(temp0.s13579bdf, int8); \ + right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * 
sizeof(DATA_TYPE))), int8); \ \ - left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ - middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \ - right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \ + left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \ + right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \ }) #endif /* CONV_STRIDE_X */ #else /* DILATION_X == 1 */ #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - left = CONVERT(vload8(0, first_value), int8); \ - middle = CONVERT(vload8(0, first_value + DILATION_X * sizeof(uchar)), int8); \ - right = CONVERT(vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)), int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \ + middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \ + right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - left = CONVERT(temp0.s02468ace, int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + left = CONVERT(temp0.s02468ace, int8); \ \ - temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \ - middle = CONVERT(temp0.s02468ace, int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \ + middle = CONVERT(temp0.s02468ace, int8); \ \ - temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \ - right = CONVERT(temp0.s02468ace, int8); \ + temp0 = CONVERT(vload16(0, 
(__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \ + right = CONVERT(temp0.s02468ace, int8); \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \ - left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \ + left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ \ - temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \ - temp1 = CONVERT(vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))), int8); \ - middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \ + temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \ + middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ \ - temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \ - temp1 = CONVERT(vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))), int8); \ - right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \ + temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \ + right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ }) #endif /* CONV_STRIDE_X */ @@ -140,49 +144,61 @@ /** This function computes the depthwise convolution quantized. 
* - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8 - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. 
Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ -__kernel void dwc_3x3_native_qasymm8_nchw( +__kernel void dwc_3x3_native_quantized8_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights) + TENSOR3D_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts) #if defined(HAS_BIAS) , VECTOR_DECLARATION(biases) #endif //defined(HAS_BIAS) ) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Vector output_multipliers = 
CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); // Extract channel and linearized batch indices const int channel = get_global_id(2) % DST_CHANNELS; @@ -198,9 +214,20 @@ __kernel void dwc_3x3_native_qasymm8_nchw( src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z; - uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y); - uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y); - uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y); + VEC_DATA_TYPE(WEIGHTS_TYPE, 3) + w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 3) + w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 3) + w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y)); + +#if defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel)); + const int output_shift = *((__global int *)vector_offset(&output_shifts, channel)); +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0)); + const int output_shift = *((__global int *)vector_offset(&output_shifts, 0)); +#endif // defined(PER_CHANNEL_QUANTIZATION) int8 values0 = 0; int8 sum0 = 0; @@ -285,9 +312,10 @@ __kernel void dwc_3x3_native_qasymm8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #if INPUT_OFFSET != 0 - ushort sum_weights = 0; - ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2); - sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2; + VEC_WEIGHTS_PROMOTED_TYPE(3) + tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + 
CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3)); + + WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2; values0 += sum_weights * (int8)(INPUT_OFFSET); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 values1 += sum_weights * (int8)(INPUT_OFFSET); @@ -307,14 +335,13 @@ __kernel void dwc_3x3_native_qasymm8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values0 += (int8)OUTPUT_OFFSET; - uchar8 res0 = convert_uchar8_sat(values0); - res0 = max(res0, (uchar8)0); - res0 = min(res0, (uchar8)255); + VEC_TYPE(8) + res0 = CONVERT_SAT(values0, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 @@ -324,134 +351,156 @@ __kernel void dwc_3x3_native_qasymm8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values1 += (int8)OUTPUT_OFFSET; - uchar8 res1 = convert_uchar8_sat(values1); - res1 = max(res1, (uchar8)0); - res1 = min(res1, (uchar8)255); + VEC_TYPE(8) + res1 = CONVERT_SAT(values1, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y); #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/ } -#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)) +#else // !defined(IS_DOT8) + #if DILATION_X == 1 #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar8 temp0 = vload8(0, first_value); \ - uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(8) \ + 
temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \ + VEC_TYPE(2) \ + temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \ \ - left = temp0.s01234567; \ - middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \ - right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \ + left = temp0.s01234567; \ + middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \ + right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - uchar temp1 = *(first_value + 16 * sizeof(uchar)); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ \ - left = temp0.s02468ace; \ - middle = temp0.s13579bdf; \ - right = (uchar8)(temp0.s2468, temp0.sace, temp1); \ + left = temp0.s02468ace; \ + middle = temp0.s13579bdf; \ + right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + VEC_TYPE(8) \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ \ - left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ - middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \ - right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \ + left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ + middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \ + right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \ }) #endif /* CONV_STRIDE_X */ #else /*DILATION_X==1*/ #if CONV_STRIDE_X == 1 -#define 
GET_VALUES(first_value, left, middle, right) \ - ({ \ - left = vload8(0, first_value); \ - middle = vload8(0, first_value + DILATION_X * sizeof(uchar)); \ - right = vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + left = vload8(0, (__global DATA_TYPE *)(first_value)); \ + middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ + right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - left = temp0.s02468ace; \ - temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \ - middle = temp0.s02468ace; \ - temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \ - right = temp0.s02468ace; \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + left = temp0.s02468ace; \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ + middle = temp0.s02468ace; \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ + right = temp0.s02468ace; \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \ - left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + VEC_TYPE(8) \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ + left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ \ - temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \ - temp1 = vload8(0, (first_value + (16 + DILATION_X)
* sizeof(uchar))); \ - middle = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))); \ + middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ \ - temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \ - temp1 = vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))); \ - right = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \ + right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ }) #endif /* CONV_STRIDE_X */ #endif /*DILATION_X==1*/ /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW. * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8 - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @note Per-channel quantization is not supported by this kernel. 
+ * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. 
Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ -__kernel void dwc_3x3_native_qasymm8_dot8_nchw( +__kernel void dwc_3x3_native_quantized8_dot8_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights) + TENSOR3D_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts) #if defined(HAS_BIAS) , VECTOR_DECLARATION(biases) #endif //defined(HAS_BIAS) ) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); // Extract channel and linearized batch indices const int channel = get_global_id(2) % DST_CHANNELS; @@ -467,13 +516,22 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z; - uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y); - uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y); - uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y); + VEC_TYPE(3) + w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y)); + VEC_TYPE(3) 
+ w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y)); + VEC_TYPE(3) + w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y)); - uchar8 left0, middle0, right0; - uchar8 left1, middle1, right1; - uchar8 left2, middle2, right2; + const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0)); + const int output_shift = *((__global int *)vector_offset(&output_shifts, 0)); + + VEC_TYPE(8) + left0, middle0, right0; + VEC_TYPE(8) + left1, middle1, right1; + VEC_TYPE(8) + left2, middle2, right2; int8 values0 = 0; int8 sum0 = 0; @@ -491,9 +549,10 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 // If conv_stride_y is equals to 1, we compute two output rows - uchar8 left3, middle3, right3; - int8 values1 = 0; - int8 sum1 = 0; + VEC_TYPE(8) + left3, middle3, right3; + int8 values1 = 0; + int8 sum1 = 0; GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3); @@ -504,69 +563,69 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1 - ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0); - ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0); + ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0); + ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0); values0.s0 += right2.s0 * w2.s2; - ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1); - ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1); + ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), 
values0.s1); + ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1); values0.s1 += right2.s1 * w2.s2; - ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2); - ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2); + ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2); + ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2); values0.s2 += right2.s2 * w2.s2; - ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3); - ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3); + ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3); + ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3); values0.s3 += right2.s3 * w2.s2; - ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4); - ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4); + ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4); + ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4); values0.s4 += right2.s4 * w2.s2; - ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5); - ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5); + ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, 
left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5); + ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5); values0.s5 += right2.s5 * w2.s2; - ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6); - ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6); + ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6); + ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6); values0.s6 += right2.s6 * w2.s2; - ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7); - ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7); + ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7); + ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7); values0.s7 += right2.s7 * w2.s2; #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 - ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0); - ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0); + ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0); + ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0); values1.s0 += right3.s0 * w2.s2; - ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1); - ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), 
(uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1); + ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1); + ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1); values1.s1 += right3.s1 * w2.s2; - ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2); - ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2); + ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2); + ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2); values1.s2 += right3.s2 * w2.s2; - ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3); - ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3); + ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3); + ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3); values1.s3 += right3.s3 * w2.s2; - ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4); - ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4); + ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4); + ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4); values1.s4 += right3.s4 * w2.s2; - ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5); - 
ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5); + ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5); + ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5); values1.s5 += right3.s5 * w2.s2; - ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6); - ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6); + ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6); + ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6); values1.s6 += right3.s6 * w2.s2; - ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7); - ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7); + ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7); + ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7); values1.s7 += right3.s7 * w2.s2; #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1 @@ -585,8 +644,9 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #if INPUT_OFFSET != 0 - ushort sum_weights = 0; - ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2); + WEIGHTS_PROMOTED_TYPE sum_weights = 0; + VEC_WEIGHTS_PROMOTED_TYPE(3) + tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3)); sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2; values0 += sum_weights * 
(int8)(INPUT_OFFSET); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 @@ -607,14 +667,13 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values0 += (int8)OUTPUT_OFFSET; - uchar8 res0 = convert_uchar8_sat(values0); - res0 = max(res0, (uchar8)0); - res0 = min(res0, (uchar8)255); + VEC_TYPE(8) + res0 = CONVERT_SAT(values0, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 @@ -625,20 +684,19 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values1 += (int8)OUTPUT_OFFSET; - uchar8 res1 = convert_uchar8_sat(values1); - res1 = max(res1, (uchar8)0); - res1 = min(res1, (uchar8)255); + VEC_TYPE(8) + res1 = CONVERT_SAT(values1, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y); #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/ } -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#endif // !defined(IS_DOT8) #endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */ @@ -646,7 +704,7 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE) -#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT) +#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, 
VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT) #if WEIGHTS_OFFSET != 0 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \ @@ -661,23 +719,23 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \ ({ \ - ARM_DOT((uchar4)(val0, val1, val2, val3), w0.s0123, acc); \ - ARM_DOT((uchar4)(val4, val5, val6, val7), w0.s4567, acc); \ + ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \ + ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \ acc += val8 * w1; \ }) #define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \ ({ \ sum = val0; \ - ARM_DOT((uchar4)(val1, val2, val3, val4), (uchar4)1, sum); \ - ARM_DOT((uchar4)(val5, val6, val7, val8), (uchar4)1, sum); \ + ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \ + ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \ }) #define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \ ({ \ sum = w1; \ - ARM_DOT(w0.s0123, (uchar4)1, sum); \ - ARM_DOT(w0.s4567, (uchar4)1, sum); \ + ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \ + ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \ }) #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) @@ -694,42 +752,52 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X) * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1) * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor reshaped. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] max_offset Max offset for the input tensor + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor reshaped. 
Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. 
Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset Max offset for the input tensor */ -__kernel void dwc_3x3_reshaped_qasymm8_nhwc( +__kernel void dwc_3x3_reshaped_quantized8_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), #endif /* defined(HAS_BIAS) */ @@ -741,7 +809,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -749,7 +817,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -768,19 +836,30 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( VEC_INT acc = 0, sum = 0; // Load weights - uchar16 w0_tmp = VLOAD(16)(0, weights_addr); - uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16); - uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16); - - uchar4 w0 = w0_tmp.s0123; - uchar4 w1 = w0_tmp.s4567; - uchar4 w2 = 
w0_tmp.s89AB; - uchar4 w3 = w0_tmp.sCDEF; - - uchar4 w4 = w1_tmp.s0123; - uchar4 w5 = w1_tmp.s4567; - uchar4 w6 = w1_tmp.s89AB; - uchar4 w7 = w1_tmp.sCDEF; + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16)); + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w0 = w0_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w1 = w0_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w2 = w0_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w3 = w0_tmp.sCDEF; + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w4 = w1_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w5 = w1_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w6 = w1_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w7 = w1_tmp.sCDEF; #if INPUT_OFFSET != 0 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) @@ -798,27 +877,36 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); - VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); + VEC_TYPE(VEC_SIZE) + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr 
+ offset.s0); - VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); + z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y; + offset = y_offset + (int4)(z_coord * src_stride_z); + VEC_TYPE(VEC_SIZE) + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); // z == 2 // Offset can be out-of-bound so we need to check if it is greater than max_offset - z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2; - offset = y_offset + (int4)(z_coord * src_stride_z); - offset = min(offset, (int4)max_offset); - VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); + z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2; + offset = y_offset + (int4)(z_coord * src_stride_z); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum); MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum); @@ -854,24 +942,34 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); -#else // defined(REAL_MULTIPLIER) +#else // defined(REAL_MULTIPLIER) +#if defined(PER_CHANNEL_QUANTIZATION) + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); + VEC_INT 
output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); + VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + + acc = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift); - acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); #endif // defined(REAL_MULTIPLIER) acc += (VEC_INT)OUTPUT_OFFSET; - VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR); - res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255); + VEC_TYPE(VEC_SIZE) + res = CONVERT_SAT(acc, VEC_TYPE(VEC_SIZE)); #if defined(DST_DEPTH) __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z; #endif /* defined(DST_DEPTH) */ VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res), 0, dst_addr); + (ACTIVATION_FUNC(res), 0, (__global DATA_TYPE *)(dst_addr)); } #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) @@ -887,43 +985,53 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1) * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1). * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] max_offset Max offset for the input tensor + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. 
Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset Max offset for the input tensor */ -__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( +__kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), #endif /* defined(HAS_BIAS) */ @@ -935,7 +1043,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -943,7 +1051,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -965,19 +1073,30 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( VEC_INT acc3 = 0, sum3 = 0; // Load weights - uchar16 w0_tmp = VLOAD(16)(0, weights_addr); - uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16); - uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16); - - uchar4 w0 = w0_tmp.s0123; - uchar4 
w1 = w0_tmp.s4567; - uchar4 w2 = w0_tmp.s89AB; - uchar4 w3 = w0_tmp.sCDEF; - - uchar4 w4 = w1_tmp.s0123; - uchar4 w5 = w1_tmp.s4567; - uchar4 w6 = w1_tmp.s89AB; - uchar4 w7 = w1_tmp.sCDEF; + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16)); + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w0 = w0_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w1 = w0_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w2 = w0_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w3 = w0_tmp.sCDEF; + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w4 = w1_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w5 = w1_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w6 = w1_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w7 = w1_tmp.sCDEF; #if INPUT_OFFSET != 0 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) @@ -995,40 +1114,56 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); - VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + VEC_TYPE(VEC_SIZE) + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we 
do not need to clamp the offset - z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1; + offset = y_offset + (int4)(z_coord * src_stride_z); + VEC_TYPE(VEC_SIZE) + values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 2 // After z = 1 we can simply add src_stride_z to offset without updating z_coord // However offset can be out-of-bound so we need to check if it is greater than max_offset offset += (int4)src_stride_z; - offset = min(offset, (int4)max_offset); - VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 3 // After z = 1 we can simply add src_stride_z to offset without updating z_coord // However offset can be 
out-of-bound so we need to check if it is greater than max_offset offset += (int4)(src_stride_z); - offset = min(offset, (int4)max_offset); - VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0); MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0); @@ -1115,10 +1250,20 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( #else // defined(REAL_MULTIPLIER) - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#if defined(PER_CHANNEL_QUANTIZATION) + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); + VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); + VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int 
*)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); + acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); + acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); #endif // defined(REAL_MULTIPLIER) @@ -1127,15 +1272,14 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( acc2 += (VEC_INT)OUTPUT_OFFSET; acc3 += (VEC_INT)OUTPUT_OFFSET; - VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR); - VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR); - VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR); - VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR); - - res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255); - res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255); - res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255); - res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255); + VEC_TYPE(VEC_SIZE) + res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res2 = CONVERT_SAT(acc2, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res3 = CONVERT_SAT(acc3, VEC_TYPE(VEC_SIZE)); #if defined(DST_DEPTH) __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w; @@ -1153,15 +1297,16 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) { VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z); + (ACTIVATION_FUNC(res2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z)); VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z); + (ACTIVATION_FUNC(res3), 0, 
(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z)); } } #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product. * + * @note Per-channel quantization is not supported by this kernel. * @note This kernel assumes VEC_SIZE is 4. * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel. * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2) @@ -1173,42 +1318,52 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication. * If not, the quantization will be performed using a fixed point multiplication * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: QASYMM8 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. 
Supported data types: QASYMM8 - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] max_offset The maximum allowed offset for the input tensor + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. 
Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset The maximum allowed offset for the input tensor */ -__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( +__kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), #endif // defined(HAS_BIAS) @@ -1220,7 +1375,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // 
spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -1228,7 +1383,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -1250,16 +1405,19 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( VEC_INT sum1 = 0; // Load weights - uchar16 w0 = VLOAD(16)(0, weights_addr); - uchar16 w1 = VLOAD(16)(0, weights_addr + 16); - uchar4 w2 = VLOAD(4)(0, weights_addr + 32); + VEC_TYPE(16) + w0 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr)); + VEC_TYPE(16) + w1 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16)); + VEC_TYPE(4) + w2 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32)); #if INPUT_OFFSET != 0 // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8); - DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); + DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA); - DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); + DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); // Multiply the weights reduction with INPUT_OFFSET acc0 = INPUT_OFFSET * acc0; @@ -1277,30 +1435,42 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); - VEC_UCHAR values0 = 
VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + VEC_TYPE(VEC_SIZE) + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z - (int)CONV_PAD_TOP + 1; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + z_coord = z - (int)CONV_PAD_TOP + 1; + offset = y_offset + (int4)(z_coord * src_stride_z); + VEC_TYPE(VEC_SIZE) + values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 2 // After z = 1 we can simply add src_stride_z to offset without updating z_coord // However offset can be out-of-bound so we need to check if it is greater than max_offset offset += (int4)src_stride_z; - offset = min(offset, (int4)max_offset); - VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, 
src_addr + offset.s1); - VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0); DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0); @@ -1309,8 +1479,8 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1); DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1); - DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); - DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); + DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); + DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); 
DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2); DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2); @@ -1319,8 +1489,8 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3); DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3); - DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); - DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); + DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); + DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); #if defined(HAS_BIAS) Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); @@ -1349,19 +1519,20 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); #else // defined(REAL_MULTIPLIER) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - 
acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); #endif // defined(REAL_MULTIPLIER) acc0 += (VEC_INT)OUTPUT_OFFSET; acc1 += (VEC_INT)OUTPUT_OFFSET; - VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR); - VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR); - - res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255); - res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255); + VEC_TYPE(VEC_SIZE) + res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE)); #if defined(DST_DEPTH) __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w; @@ -1370,9 +1541,9 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( #endif /* defined(DST_DEPTH) */ VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y); + (ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y); + (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); } #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4 @@ -1380,9 +1551,11 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( #endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) -#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) +#endif // defined(WEIGHTS_PROMOTED_TYPE) -#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && 
defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) + +#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped * * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2) @@ -1398,43 +1571,53 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. 
Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ __kernel void dwc_MxN_native_quantized8_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights) + TENSOR3D_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts) #if defined(HAS_BIAS) , VECTOR_DECLARATION(biases) @@ -1447,19 +1630,30 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) - __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(uchar) * (int)N0; + __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0; - __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(uchar) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z; + __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z; - __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(uchar) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0; #if defined(HAS_BIAS) __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; #endif // defined(HAS_BIAS) 
+#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; + + VEC_INT output_multiplier = (VEC_INT)0; + VEC_INT output_shift = (VEC_INT)0; +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + #if defined(DST_DEPTH) s_addr += b * src_stride_w; d_addr += b * dst_stride_w; @@ -1489,8 +1683,8 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int w_offset = xk * weights_stride_y + yk * weights_stride_z; // Load input and weights values - VEC_SHORT i = CONVERT(VLOAD(N0)(0, (__global uchar *)(s_addr + s_offset)), VEC_SHORT); - VEC_SHORT w = CONVERT(VLOAD(N0)(0, (__global uchar *)(w_addr + w_offset)), VEC_SHORT); + VEC_SHORT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_SHORT); + VEC_SHORT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_SHORT); res += (i + (VEC_SHORT)INPUT_OFFSET) * (w + (VEC_SHORT)WEIGHTS_OFFSET); } @@ -1505,21 +1699,32 @@ __kernel void dwc_MxN_native_quantized8_nhwc( res += bias; #endif // defined(HAS_BIAS) - res = CONVERT(ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(CONVERT(res, VEC_INT), OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0), VEC_SHORT); +#if defined(PER_CHANNEL_QUANTIZATION) + output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr)); + output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + + res = CONVERT(ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(CONVERT(res, VEC_INT), 
output_multiplier, output_shift, N0), VEC_SHORT); res += (VEC_SHORT)OUTPUT_OFFSET; - VEC_UCHAR res1 = CONVERT_SAT(res, VEC_UCHAR); + VEC_TYPE(VEC_SIZE) + res1 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE)); VSTORE(N0) - (ACTIVATION_FUNC(res1), 0, (__global uchar *)(d_addr)); + (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(d_addr)); #if DEPTH_MULTIPLIER > 1 - w_addr += sizeof(uchar); - d_addr += sizeof(uchar); + w_addr += sizeof(WEIGHTS_TYPE); + d_addr += sizeof(DATA_TYPE); +#if defined(PER_CHANNEL_QUANTIZATION) + out_mul_addr += sizeof(int); + out_shift_addr += sizeof(int); +#endif // defined(PER_CHANNEL_QUANTIZATION) #if defined(HAS_BIAS) b_addr += sizeof(int); #endif // defined(HAS_BIAS) } #endif // DEPTH_MULTIPLIER > 1 } -#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) +#endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE) diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 57ecccc2b2..f115602a1a 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -93,16 +93,18 @@ inline float dequantize_qasymm8(uchar input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. 
*/ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) \ + zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) \ + one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index f232f6cfc0..e883e8f250 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -113,21 +113,7 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast(channels - vec_size), 0))); - - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - 
build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); // Create kernel std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout)); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index 42e5fbc8f2..a2f4a913ce 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -37,13 +37,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D dilation) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -52,7 +54,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC), "For QASYMM8 only logistic, relu, lower bounded relu 
and lower-upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3); @@ -74,28 +75,43 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(output->total_size() != 0) + if(is_qasymm) { - const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - } + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(is_qasymm) + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_shifts->dimension(0)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0)); + } + } + else { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? 
output->quantization_info().uniform() : iq_info; + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - ARM_COMPUTE_UNUSED(multiplier); - ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + if(output->total_size() != 0) + { + const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation) +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation) { // Output auto inizialitation if not yet initialized const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); @@ -182,9 +198,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } else { - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_data_type_quantized_per_channel(weights->data_type()); - kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3"; + kernel_name = is_qasymm ? "dwc_3x3_native_quantized8" : "depthwise_convolution_3x3"; kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : ""); kernel_name += (is_qasymm ? 
"_nchw" : ""); @@ -224,23 +240,28 @@ BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const return _border_size; } -void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, + const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation)); - - bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type()); - - _input = input; - _output = output; - _weights = weights; - _biases = biases; - _conv_stride_x = conv_info.stride().first; - _conv_stride_y = conv_info.stride().second; - _conv_pad_left = conv_info.pad_left(); - _conv_pad_top = conv_info.pad_top(); - _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + conv_info, depth_multiplier, act_info, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? 
output_shifts->info() : nullptr)); + + _input = input; + _output = output; + _weights = weights; + _biases = biases; + _conv_stride_x = conv_info.stride().first; + _conv_stride_y = conv_info.stride().second; + _conv_pad_left = conv_info.pad_left(); + _conv_pad_top = conv_info.pad_top(); + _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left); + _output_multipliers = output_multipliers; + _output_shifts = output_shifts; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); // Configure kernel window std::string kernel_name; @@ -260,24 +281,21 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); - if(is_qasymm) + if(_is_quantized) { const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel; build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)); build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); build_opts.add_option("-DK_OFFSET=" + 
support::cpp11::to_string(9 * iq_info.offset * wq_info.offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8"); if(act_info.enabled()) { @@ -293,6 +311,10 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1)); build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1)); } + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type())); + build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type())); } else { @@ -323,12 +345,15 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, _config_id += support::cpp11::to_string(output->info()->dimension(1)); } -Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation) +Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, + const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { std::string kernel_name; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, 
depth_multiplier, act_info, dilation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), + conv_info, depth_multiplier, gpu_target, kernel_name, dilation) + .first); return Status{}; } @@ -353,18 +378,28 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::Com slice_weights.set_dimension_step(Window::DimX, 0); slice_weights.set_dimension_step(Window::DimY, 0); + unsigned int idx = 3 * num_arguments_per_3D_tensor(); + + // Set output multipliers in case of quantized data type + if(_is_quantized) + { + Window slice; + slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape()); + add_1D_tensor_argument(idx, _output_multipliers, slice); + add_1D_tensor_argument(idx, _output_shifts, slice); + } + // Set biases if(_biases != nullptr) { - unsigned int idx = 3 * num_arguments_per_3D_tensor(); - Window slice_biases; + Window slice_biases; slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape()); add_1D_tensor_argument(idx, _biases, slice_biases); } do { - unsigned int idx = 0; + idx = 0; add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); add_3D_tensor_argument(idx, _weights, slice_weights); @@ -373,3 +408,4 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::Com } while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in)); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp 
b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index b8b144dbfa..d5f37f32ce 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -41,17 +41,18 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) + && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC), "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC 
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1); @@ -63,26 +64,47 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const size_t weights_width = 3; const size_t weights_height = 3; + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( + *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation); if(is_qasymm) { DepthwiseConvolutionReshapeInfo info; info.c0 = 4; ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_shifts->dimension(0)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0)); + } } else { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height)); } if(biases != nullptr) { + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[0]); if(is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } else { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); } @@ -91,27 +113,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, if(output->total_size() != 0) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( - *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } - if(is_qasymm) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; - - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - ARM_COMPUTE_UNUSED(multiplier); - ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); - } - return Status{}; } std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) { const size_t weights_width = 3; const size_t weights_height = 3; @@ -144,7 +154,17 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen if(is_qasymm) { - window_changed = update_window_and_padding(win, input_access, output_access); + if((output_multipliers != nullptr) && (output_shifts != nullptr)) + { + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_accessed_per_iteration); + AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_accessed_per_iteration); + window_changed = window_changed || 
update_window_and_padding(win, input_access, output_access, output_multipliers_access, output_shifts_access); + } + else + { + Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input"); + return std::make_pair(err, win); + } } else { @@ -157,7 +177,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration); window_changed = window_changed || update_window_and_padding(win, bias_access); } - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -175,19 +194,26 @@ BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const return _border_size; } -void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, + const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation)); - auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + conv_info, depth_multiplier, act_info, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? output_shifts->info() : nullptr); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type()); const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel; _input = input; _output = output; @@ -196,16 +222,19 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, _conv_stride_y = conv_info.stride().second; _num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 
2 : 1; + _output_multipliers = output_multipliers; + _output_shifts = output_shifts; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1 - if(is_dot8_supported && is_qasymm) + if(is_dot8_supported && _is_quantized) { _num_planes_processed_per_iteration = 1; } - _border_size = BorderSize(is_qasymm && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); + _border_size = BorderSize(_is_quantized && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); - const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size()); + const unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : (8 / input->info()->element_size()); CLBuildOptions build_opts; build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); @@ -217,24 +246,19 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - if(is_qasymm) + if(_is_quantized) { const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - build_opts.add_option("-DSRC_DIM_1=" + 
support::cpp11::to_string(_input->info()->dimension(1))); build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8"); if(act_info.enabled()) { @@ -250,6 +274,10 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1)); build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1)); } + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type())); + build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type())); } else { @@ -274,9 +302,9 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, std::string kernel_name; // Create kernel - if(is_qasymm) + if(_is_quantized) { - kernel_name = std::string("dwc_3x3_reshaped_qasymm8"); + kernel_name = std::string("dwc_3x3_reshaped_quantized8"); kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : ""); kernel_name += (is_stride_1_dilation_1 ? 
"_stride1" : ""); kernel_name += "_nhwc"; @@ -309,13 +337,16 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, _config_id += string_from_data_type(input->info()->data_type()); } -Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), biases != nullptr ? biases->clone().get() : nullptr, - output->clone().get(), conv_info, depth_multiplier, dilation) + output->clone().get(), conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->clone().get() : nullptr, + (output_shifts != nullptr) ? 
output_shifts->clone().get() : nullptr) .first); return Status{}; @@ -329,7 +360,6 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com // Collapse window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3); - const bool is_qasymm = is_data_type_quantized_asymmetric(_input->info()->data_type()); Window win = window_collapsed; win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast(_num_planes_processed_per_iteration)) * total_batches, 1)); @@ -344,7 +374,16 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com Window slice_in = win_in.first_slice_window_4D(); Window slice_out = win.first_slice_window_4D(); - unsigned int idx = 2 * num_arguments_per_4D_tensor() + (is_qasymm ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + unsigned int idx = 2 * num_arguments_per_4D_tensor() + (_is_quantized ? 
num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + + if(_is_quantized) + { + Window slice; + slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape()); + slice.set_dimension_step(Window::DimX, window.x().step()); + add_1D_tensor_argument(idx, _output_multipliers, slice); + add_1D_tensor_argument(idx, _output_shifts, slice); + } if(_biases != nullptr) { @@ -398,7 +437,7 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com unsigned int idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_4D_tensor_argument(idx, _output, slice_out); - if(is_qasymm) + if(_is_quantized) { add_2D_tensor_argument(idx, _weights, slice_out); } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 2115fc614d..3fc236eaa7 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -42,13 +42,13 @@ namespace arm_compute namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1); 
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1); @@ -57,24 +57,53 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_UNUSED(idx_c); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier)); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + + const bool is_quantized = is_data_type_quantized(input->data_type()); + if(biases != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(input->data_type())) + if(is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + } + } + + if(is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_shifts->dimension(0)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + 
ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0)); } } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } if(output->total_size() != 0) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } @@ -82,7 +111,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); @@ -113,6 +143,21 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen window_changed = update_window_and_padding(win, input_access, weights_access, output_access); } + if(is_data_type_quantized(input->data_type())) + { + if((output_multipliers != nullptr) && (output_shifts != nullptr)) + { + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, n0); + AccessWindowHorizontal output_shifts_access(output_shifts, 0, n0); + window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); + } + else + { + Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input"); + return std::make_pair(err, win); + } + } + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = 
(window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -121,32 +166,44 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel() - : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _depth_multiplier(1) + : _input(nullptr), + _weights(nullptr), + _biases(nullptr), + _output(nullptr), + _depth_multiplier(1), + _output_multipliers(nullptr), + _output_shifts(nullptr), + _is_quantized(false) { } void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, - dilation)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); - auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, - dilation); + auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - _input = input; - _output = output; - _weights = weights; - _biases = biases; - _depth_multiplier = depth_multiplier; + _input = input; + _output = output; + _weights = weights; + _biases = biases; + _depth_multiplier = depth_multiplier; + _output_multipliers = output_multipliers; + _output_shifts = output_shifts; + _is_quantized = is_data_type_quantized(input->info()->data_type()); const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); const size_t weights_width = weights->info()->dimension(idx_w); const size_t weights_height = weights->info()->dimension(idx_h); - const bool is_quantized = is_data_type_quantized(input->info()->data_type()); CLBuildOptions build_opts; build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); @@ -166,24 +223,18 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - std::string kernel_name = (is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc"; + std::string kernel_name = (_is_quantized) ? 
"dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc"; - if(is_quantized) + if(_is_quantized) { const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION"); if(dwc_info.activation_info.enabled()) { @@ -199,6 +250,9 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1)); build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1)); } + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type())); } else { @@ -228,12 +282,15 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, } Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const DWCWeightsKernelInfo &dwc_weights_info, const 
DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), biases != nullptr ? biases->clone().get() : nullptr, - output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation) + output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr) .first); return Status{}; @@ -255,15 +312,23 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1)); } + unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor(); + + // Set output multipliers in case of quantized data type + if(_is_quantized) + { + add_1D_tensor_argument(idx, _output_multipliers, slice_in); + add_1D_tensor_argument(idx, _output_shifts, slice_in); + } + if(_biases != nullptr) { - unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor(); add_1D_tensor_argument(idx, _biases, slice_in); } do { - unsigned int idx = 0; + idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_4D_tensor_argument(idx, _output, slice_out); add_3D_tensor_argument(idx, _weights, slice_out); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp index 1fd6312295..ec889ec949 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp @@ -47,7 +47,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3); @@ -98,10 +97,10 @@ void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor // Build the kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + 
get_cl_type_from_data_type(_input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0)); build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0))); build_opts.add_option_if(info.transpose, "-DTRANSPOSE"); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); _kernel = static_cast(CLKernelLibrary::get().create_kernel("depthwise_convolution_reshape_weights", build_opts.options())); } diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp index 72f2ca40f5..7010dffd25 100644 --- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp @@ -37,7 +37,8 @@ #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -139,21 +140,7 @@ void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor * build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1))); build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2))); - - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); std::string kernel_name("gemm_reshape_lhs_matrix_"); kernel_name += lhs_info.transpose ? 
"t" : "nt"; @@ -219,4 +206,5 @@ void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &q enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_3D(slice)); -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp index 2ca4132b15..6f6019d26a 100644 --- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp @@ -37,7 +37,8 @@ #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -118,21 +119,7 @@ void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor * build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE"); build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE"); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); - - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); std::string kernel_name("gemm_reshape_rhs_matrix_"); kernel_name += rhs_info.transpose ? 
"t" : "nt"; @@ -169,4 +156,5 @@ void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &q enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_3D(slice)); -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp index ea292c0b7b..85917d38dd 100644 --- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp @@ -40,7 +40,8 @@ #include -using namespace arm_compute; +namespace arm_compute +{ namespace { std::pair validate_and_configure_window(ITensorInfo *input, unsigned int height_offset, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration) @@ -102,31 +103,7 @@ void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned // Add build options CLBuildOptions build_opts; - - switch(input->info()->element_size()) - { - case 1: - { - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - } - case 2: - { - build_opts.add_option("-DDATA_TYPE=short"); - break; - } - case 4: - { - build_opts.add_option("-DDATA_TYPE=int"); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported input data type."); - break; - } - } - + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); @@ -164,3 +141,4 @@ void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue add_4D_tensor_argument(idx, _output, window); enqueue(queue, *this, window, lws_hint()); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp 
b/src/core/CL/kernels/CLPermuteKernel.cpp index 9cb72b3c04..81a810fcb8 100644 --- a/src/core/CL/kernels/CLPermuteKernel.cpp +++ b/src/core/CL/kernels/CLPermuteKernel.cpp @@ -52,11 +52,6 @@ TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector & Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4, "Permutation upto 4-D input tensor is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 84bf5bf874..796f0d068a 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -81,20 +81,7 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const // Set kernel build options CLBuildOptions build_opts; build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0))); - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reverse", build_opts.options())); diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index d9e05d7ee8..7e1af0e27d 100644 --- a/src/core/Utils.cpp +++ 
b/src/core/Utils.cpp @@ -292,6 +292,7 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const converted_string = ss.str(); break; case DataType::S8: + case DataType::QSYMM8_PER_CHANNEL: // Needs conversion to 32 bit, otherwise interpreted as ASCII values ss << int32_t(value.get<int8_t>()); converted_string = ss.str(); @@ -437,6 +438,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim); break; case DataType::S8: + case DataType::QSYMM8_PER_CHANNEL: print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim); break; case DataType::U16: @@ -473,6 +475,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp case DataType::QASYMM8_PER_CHANNEL: return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n); case DataType::S8: + case DataType::QSYMM8_PER_CHANNEL: return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n); case DataType::U16: case DataType::QASYMM16: diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 42bd84db47..cdd48972eb 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Helpers.h" #include #include @@ -134,5 +135,26 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty } return std::make_pair(min_quant_val, max_quant_val); } +void compute_quantized_multipliers_and_shifts(const ITensor *input, const ITensor *weights, const ITensor *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr) +{ + const unsigned int idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int num_filters = is_data_type_quantized_per_channel(weights->info()->data_type()) ? weights->info()->dimension(idx_c) : 1; + + const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); + const QuantizationInfo wq_info = weights->info()->quantization_info(); + const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + + for(unsigned int i = 0; i < num_filters; ++i) + { + int output_multiplier = 0; + int output_shift = 0; + const float multiplier = iq_info.scale * wq_info.scale()[i] / oq_info.scale; + ARM_COMPUTE_ERROR_ON(multiplier > 1.0f); + calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + + output_multipliers_ptr[i] = output_multiplier; + output_shifts_ptr[i] = output_shift; + } +} } // quantization } // arm_compute -- cgit v1.2.1