author     Pablo Tello <pablo.tello@arm.com>  2019-02-27 13:32:51 +0000
committer  Giuseppe Rossini <giuseppe.rossini@arm.com>  2019-03-08 10:41:25 +0000
commit     471043616a869f0e696c8db4e1d0a62b45b4decf (patch)
tree       79f3386c4d6757472835003fbf552566213f3bce
parent     adc5395ad72aceb2c9e7e6beb54d949959d35143 (diff)
download   ComputeLibrary-471043616a869f0e696c8db4e1d0a62b45b4decf.tar.gz
COMPMID-1882: Improve memory coalescence when reshaping the weights for CLDepthwiseConvolution
Change-Id: I97788d9e349f37fcd818d588d668e2d5e22fd568
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/818
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp                                  | 10
-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution_quantized.cl        | 96
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp |  5
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 16
4 files changed, 76 insertions(+), 51 deletions(-)
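
The heart of the change is in the two reshaped NHWC kernels below: the nine per-tap VLOAD(VEC_SIZE) weight loads are replaced by two 16-byte loads plus one 4-byte load over the same 36 contiguous bytes, which the GPU can serve with far fewer, wider memory transactions. A minimal standalone sketch of that access pattern follows (hypothetical kernel name, VLOAD macros expanded to plain OpenCL C; not the library kernel itself):

    // Hypothetical standalone sketch (plain OpenCL C, not the library kernel):
    // "weights" is assumed to already be in the layout produced by
    // CLDepthwiseConvolutionLayerReshapeWeightsKernel, 4 channels per filter tap.
    __kernel void coalesced_weight_load_sketch(__global const uchar *weights, __global int *out)
    {
        // Old pattern: nine independent vload4 calls, 4 bytes each, one per 3x3 tap.
        // New pattern: two vload16 calls plus one vload4 cover the same 36 bytes,
        // so the loads coalesce into wide, contiguous transactions.
        uchar16 w0_tmp = vload16(0, weights);
        uchar16 w1_tmp = vload16(0, weights + 16);
        uchar4  w8     = vload4(0, weights + 2 * 16);

        // Swizzle the wide loads back into the nine per-tap uchar4 vectors.
        uchar4 w0 = w0_tmp.s0123, w1 = w0_tmp.s4567, w2 = w0_tmp.s89AB, w3 = w0_tmp.sCDEF;
        uchar4 w4 = w1_tmp.s0123, w5 = w1_tmp.s4567, w6 = w1_tmp.s89AB, w7 = w1_tmp.sCDEF;

        // Dummy reduction so the loads are not optimised away in this sketch.
        out[get_global_id(0)] = w0.s0 + w1.s0 + w2.s0 + w3.s0 + w4.s0
                              + w5.s0 + w6.s0 + w7.s0 + w8.s0;
    }
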
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a86d8d0414..7476e19011 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -217,11 +217,11 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_nhwc", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_dot8_nchw", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_dot8_nhwc_stride1", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
{ "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 606af2edb1..503aa7e837 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -117,7 +117,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized_nchw(
+__kernel void dwc_3x3_native_qasymm8_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -254,7 +254,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
#else // defined(REAL_MULTIPLIER)
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // defined(REAL_MULTIPLIER)
@@ -271,7 +271,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
#else // defined(REAL_MULTIPLIER)
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // defined(REAL_MULTIPLIER)
@@ -349,7 +349,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
+__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -597,9 +597,10 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
*
+ * @note This kernel assumes VEC_SIZE is 4.
* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -640,7 +641,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset Max offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_nhwc(
+__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -654,9 +655,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -682,15 +683,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
VEC_INT acc = 0, sum = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+ uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+ uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
+
+ uchar4 w0 = w0_tmp.s0123;
+ uchar4 w1 = w0_tmp.s4567;
+ uchar4 w2 = w0_tmp.s89AB;
+ uchar4 w3 = w0_tmp.sCDEF;
+
+ uchar4 w4 = w1_tmp.s0123;
+ uchar4 w5 = w1_tmp.s4567;
+ uchar4 w6 = w1_tmp.s89AB;
+ uchar4 w7 = w1_tmp.sCDEF;
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -766,7 +771,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
#else // defined(REAL_MULTIPLIER)
- acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
acc += (VEC_INT)OUTPUT_OFFSET;
@@ -785,9 +790,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
*
+ * @note This kernel assumes VEC_SIZE is 4.
* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -829,7 +835,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
* @param[in] max_offset Max offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -843,9 +849,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -874,15 +880,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
VEC_INT acc3 = 0, sum3 = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+ uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+ uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
+
+ uchar4 w0 = w0_tmp.s0123;
+ uchar4 w1 = w0_tmp.s4567;
+ uchar4 w2 = w0_tmp.s89AB;
+ uchar4 w3 = w0_tmp.sCDEF;
+
+ uchar4 w4 = w1_tmp.s0123;
+ uchar4 w5 = w1_tmp.s4567;
+ uchar4 w6 = w1_tmp.s89AB;
+ uchar4 w7 = w1_tmp.sCDEF;
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1020,10 +1030,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
#else // defined(REAL_MULTIPLIER)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
@@ -1110,7 +1120,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset The maximum allowed offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -1124,9 +1134,9 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -1255,8 +1265,8 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
#else // defined(REAL_MULTIPLIER)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
acc0 += (VEC_INT)OUTPUT_OFFSET;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index e8efe6f0a9..770740d180 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -171,7 +171,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
{
const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
- kernel_name = is_qasymm ? (std::string("depthwise_convolution_3x3_quantized") + (is_dot8_supported ? "_dot8" : "") + "_nchw") : "depthwise_convolution_3x3";
+ kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3";
+ kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
+ kernel_name += "_nchw";
+
num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1) ? 2 : 1;
num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index a82d3a6fae..431039c31f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -252,9 +252,21 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input,
build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
"-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
+ std::string kernel_name;
// Create kernel
- std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
- && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+ if(is_qasymm)
+ {
+ kernel_name = std::string("dwc_3x3_reshaped_qasymm8");
+ kernel_name += (is_dot8_supported && is_stride_1 ? "_dot8" : "");
+ kernel_name += (is_stride_1 ? "_stride1" : "");
+ kernel_name += "_nhwc";
+ }
+ else
+ {
+ kernel_name = std::string("depthwise_convolution_3x3_nhwc");
+ kernel_name += (is_stride_1 ? "_stride1" : "");
+ }
+
ICLKernel::configure_internal(win_config.second);
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
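
For reference, the NHWC name selection above resolves to one of five kernels registered in CLKernelLibrary.cpp: dwc_3x3_reshaped_qasymm8[_dot8][_stride1]_nhwc for QASYMM8, or depthwise_convolution_3x3_nhwc[_stride1] otherwise. A hypothetical helper condensing that logic (it mirrors the hunk above and is not part of the library):

    #include <string>

    // Hypothetical helper: mirrors the NHWC kernel-name selection performed in
    // CLDepthwiseConvolutionLayer3x3NHWCKernel::configure above.
    std::string nhwc_kernel_name(bool is_qasymm, bool is_dot8_supported, bool is_stride_1)
    {
        if(is_qasymm)
        {
            std::string name = "dwc_3x3_reshaped_qasymm8";
            name += (is_dot8_supported && is_stride_1) ? "_dot8" : "";
            name += is_stride_1 ? "_stride1" : "";
            return name + "_nhwc";
        }
        return std::string("depthwise_convolution_3x3_nhwc") + (is_stride_1 ? "_stride1" : "");
    }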