From 471043616a869f0e696c8db4e1d0a62b45b4decf Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 27 Feb 2019 13:32:51 +0000 Subject: COMPMID-1882: Improve memory coalescence when reshaping the weights for CLDepthwiseConvolution Change-Id: I97788d9e349f37fcd818d588d668e2d5e22fd568 Signed-off-by: giuros01 Reviewed-on: https://review.mlplatform.org/c/818 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- src/core/CL/CLKernelLibrary.cpp | 10 +-- .../cl_kernels/depthwise_convolution_quantized.cl | 96 ++++++++++++---------- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 5 +- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 16 +++- 4 files changed, 76 insertions(+), 51 deletions(-) diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index a86d8d0414..7476e19011 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -217,11 +217,11 @@ const std::map CLKernelLibrary::_kernel_program_map = { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" }, { "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" }, { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" }, - { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" }, - { "depthwise_convolution_3x3_quantized_nhwc", "depthwise_convolution_quantized.cl" }, - { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" }, - { "depthwise_convolution_3x3_quantized_dot8_nchw", "depthwise_convolution_quantized.cl" }, - { "depthwise_convolution_3x3_quantized_dot8_nhwc_stride1", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" }, { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" }, { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" }, diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index 606af2edb1..503aa7e837 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -117,7 +117,7 @@ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ -__kernel void depthwise_convolution_3x3_quantized_nchw( +__kernel void dwc_3x3_native_qasymm8_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights) @@ -254,7 +254,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); #endif // defined(REAL_MULTIPLIER) @@ -271,7 +271,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); #endif // defined(REAL_MULTIPLIER) @@ -349,7 +349,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ -__kernel void depthwise_convolution_3x3_quantized_dot8_nchw( +__kernel void dwc_3x3_native_qasymm8_dot8_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights) @@ -597,9 +597,10 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) +#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1. * + * @note This kernel assumes VEC_SIZE is 4. * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel. * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2) * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112) @@ -640,7 +641,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector * @param[in] max_offset Max offset for the input tensor */ -__kernel void depthwise_convolution_3x3_quantized_nhwc( +__kernel void dwc_3x3_reshaped_qasymm8_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), @@ -654,9 +655,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch -#else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y -#endif // defined(DST_DEPTH) +#else // defined(DST_DEPTH) + int z = get_global_id(2); // spatial coordinate y +#endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -682,15 +683,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( VEC_INT acc = 0, sum = 0; // Load weights - VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0); - VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE); - VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE); - VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE); - VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE); - VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE); - VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE); - VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE); - VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE); + uchar16 w0_tmp = VLOAD(16)(0, weights_addr); + uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16); + uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16); + + uchar4 w0 = w0_tmp.s0123; + uchar4 w1 = w0_tmp.s4567; + uchar4 w2 = w0_tmp.s89AB; + uchar4 w3 = w0_tmp.sCDEF; + + uchar4 w4 = w1_tmp.s0123; + uchar4 w5 = w1_tmp.s4567; + uchar4 w6 = w1_tmp.s89AB; + uchar4 w7 = w1_tmp.sCDEF; #if INPUT_OFFSET != 0 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) @@ -766,7 +771,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( #else // defined(REAL_MULTIPLIER) - acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); #endif // defined(REAL_MULTIPLIER) acc += (VEC_INT)OUTPUT_OFFSET; @@ -785,9 +790,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( } #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) -#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) +#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1. * + * @note This kernel assumes VEC_SIZE is 4. * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel. * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2) * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112) @@ -829,7 +835,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc( * @param[in] max_offset Max offset for the input tensor */ -__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( +__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), @@ -843,9 +849,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch -#else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y -#endif // defined(DST_DEPTH) +#else // defined(DST_DEPTH) + int z = get_global_id(2); // spatial coordinate y +#endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -874,15 +880,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( VEC_INT acc3 = 0, sum3 = 0; // Load weights - VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0); - VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE); - VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE); - VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE); - VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE); - VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE); - VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE); - VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE); - VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE); + uchar16 w0_tmp = VLOAD(16)(0, weights_addr); + uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16); + uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16); + + uchar4 w0 = w0_tmp.s0123; + uchar4 w1 = w0_tmp.s4567; + uchar4 w2 = w0_tmp.s89AB; + uchar4 w3 = w0_tmp.sCDEF; + + uchar4 w4 = w1_tmp.s0123; + uchar4 w5 = w1_tmp.s4567; + uchar4 w6 = w1_tmp.s89AB; + uchar4 w7 = w1_tmp.sCDEF; #if INPUT_OFFSET != 0 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) @@ -1020,10 +1030,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( #else // defined(REAL_MULTIPLIER) - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); #endif // defined(REAL_MULTIPLIER) @@ -1110,7 +1120,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector * @param[in] max_offset The maximum allowed offset for the input tensor */ -__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( +__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), @@ -1124,9 +1134,9 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch -#else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y -#endif // defined(DST_DEPTH) +#else // defined(DST_DEPTH) + int z = get_global_id(2); // spatial coordinate y +#endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -1255,8 +1265,8 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( #else // defined(REAL_MULTIPLIER) - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); #endif // defined(REAL_MULTIPLIER) acc0 += (VEC_INT)OUTPUT_OFFSET; diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index e8efe6f0a9..770740d180 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -171,7 +171,10 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen { const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - kernel_name = is_qasymm ? (std::string("depthwise_convolution_3x3_quantized") + (is_dot8_supported ? "_dot8" : "") + "_nchw") : "depthwise_convolution_3x3"; + kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3"; + kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : ""); + kernel_name += "_nchw"; + num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type()); num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1) ? 2 : 1; num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x; diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index a82d3a6fae..431039c31f 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -252,9 +252,21 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(static_cast(std::ceil(_output->info()->dimension(2) / static_cast(_num_planes_processed_per_iteration))))); + std::string kernel_name; // Create kernel - std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported - && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : ""); + if(is_qasymm) + { + kernel_name = std::string("dwc_3x3_reshaped_qasymm8"); + kernel_name += (is_dot8_supported && is_stride_1 ? "_dot8" : ""); + kernel_name += (is_stride_1 ? "_stride1" : ""); + kernel_name += "_nhwc"; + } + else + { + kernel_name = std::string("depthwise_convolution_3x3_nhwc"); + kernel_name += (is_stride_1 ? "_stride1" : ""); + } + ICLKernel::configure_internal(win_config.second); _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); -- cgit v1.2.1