aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2019-02-27 13:32:51 +0000
committerGiuseppe Rossini <giuseppe.rossini@arm.com>2019-03-08 10:41:25 +0000
commit471043616a869f0e696c8db4e1d0a62b45b4decf (patch)
tree79f3386c4d6757472835003fbf552566213f3bce /src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
parentadc5395ad72aceb2c9e7e6beb54d949959d35143 (diff)
downloadComputeLibrary-471043616a869f0e696c8db4e1d0a62b45b4decf.tar.gz
COMPMID-1882: Improve memory coalescence when reshaping the weights for CLDepthwiseConvolution
Change-Id: I97788d9e349f37fcd818d588d668e2d5e22fd568 Signed-off-by: giuros01 <giuseppe.rossini@arm.com> Reviewed-on: https://review.mlplatform.org/c/818 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/depthwise_convolution_quantized.cl')
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution_quantized.cl96
1 files changed, 53 insertions, 43 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 606af2edb1..503aa7e837 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -117,7 +117,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized_nchw(
+__kernel void dwc_3x3_native_qasymm8_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -254,7 +254,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
#else // defined(REAL_MULTIPLIER)
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // defined(REAL_MULTIPLIER)
@@ -271,7 +271,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
#else // defined(REAL_MULTIPLIER)
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // defined(REAL_MULTIPLIER)
@@ -349,7 +349,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
+__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -597,9 +597,10 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
*
+ * @note This kernel assumes VEC_SIZE is 4.
* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -640,7 +641,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset Max offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_nhwc(
+__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -654,9 +655,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -682,15 +683,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
VEC_INT acc = 0, sum = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+ uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+ uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
+
+ uchar4 w0 = w0_tmp.s0123;
+ uchar4 w1 = w0_tmp.s4567;
+ uchar4 w2 = w0_tmp.s89AB;
+ uchar4 w3 = w0_tmp.sCDEF;
+
+ uchar4 w4 = w1_tmp.s0123;
+ uchar4 w5 = w1_tmp.s4567;
+ uchar4 w6 = w1_tmp.s89AB;
+ uchar4 w7 = w1_tmp.sCDEF;
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -766,7 +771,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
#else // defined(REAL_MULTIPLIER)
- acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
acc += (VEC_INT)OUTPUT_OFFSET;
@@ -785,9 +790,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
*
+ * @note This kernel assumes VEC_SIZE is 4.
* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -829,7 +835,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
* @param[in] max_offset Max offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -843,9 +849,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -874,15 +880,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
VEC_INT acc3 = 0, sum3 = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+ uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+ uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
+
+ uchar4 w0 = w0_tmp.s0123;
+ uchar4 w1 = w0_tmp.s4567;
+ uchar4 w2 = w0_tmp.s89AB;
+ uchar4 w3 = w0_tmp.sCDEF;
+
+ uchar4 w4 = w1_tmp.s0123;
+ uchar4 w5 = w1_tmp.s4567;
+ uchar4 w6 = w1_tmp.s89AB;
+ uchar4 w7 = w1_tmp.sCDEF;
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1020,10 +1030,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
#else // defined(REAL_MULTIPLIER)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
@@ -1110,7 +1120,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset The maximum allowed offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -1124,9 +1134,9 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -1255,8 +1265,8 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
#else // defined(REAL_MULTIPLIER)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
acc0 += (VEC_INT)OUTPUT_OFFSET;