From fa23f1102f3e2d41838b8a9b53ab74c24cea5b50 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Tue, 19 Jun 2018 11:27:38 +0100 Subject: COMPMID-1246 CLDepthwiseConvolution QASYMM8 NHWC kernel cleanup Change-Id: If9385e6bcbf2242b973f42d6979b16ebc39f2cb4 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/136159 Tested-by: Jenkins Reviewed-by: Georgios Pinitas Reviewed-by: Gian Marco Iodice --- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.h | 1 + .../ICLDepthwiseConvolutionLayer3x3Kernel.h | 3 +- .../cl_kernels/depthwise_convolution_quantized.cl | 294 ++++++++++----------- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 2 +- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 12 +- 5 files changed, 146 insertions(+), 166 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h index 59cdf339bd..b1c730d9a7 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h @@ -73,6 +73,7 @@ public: private: unsigned int _conv_stride_x; unsigned int _conv_pad_top; + unsigned int _conv_pad_left; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H__ */ diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h index 15233c5c32..3396de2e46 100644 --- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h +++ b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h @@ -37,7 +37,7 @@ class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel public: /** Default constructor */ ICLDepthwiseConvolutionLayer3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _conv_pad_left(0) + : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1) { } /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -69,7 +69,6 @@ protected: const ICLTensor *_weights; const ICLTensor *_biases; unsigned int _conv_stride_y; - unsigned int _conv_pad_left; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index ccb3a1ffe2..88e009d678 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -252,29 +252,24 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( #endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) */ -#if defined(VEC_SIZE) && defined(SRC_DEPTH) && defined(CONV_PAD_TOP) && defined(ROWS_READ) +#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE) #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) #define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE) -#define BIFROST_MAD_4(acc, x, y) \ - ({ \ - acc.s0 += (ushort)x.s0 * (ushort)y.s0; \ - acc.s1 += (ushort)x.s1 * (ushort)y.s1; \ - acc.s2 += (ushort)x.s2 * (ushort)y.s2; \ - acc.s3 += (ushort)x.s3 * (ushort)y.s3; \ - }) +#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT) #if WEIGHTS_OFFSET != 0 -#define BIFROST_MAD_ACC_4(acc, sum, x, y) \ - ({ \ - sum += CONVERT(x, VEC_INT); \ - BIFROST_MAD_4(acc, x, y); \ +#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \ + ({ \ + sum += CONVERT(x, VEC_INT); \ + MULTIPLY_ADD(x, y, acc); \ }) #else /* WEIGHTS_OFFSET != 0 */ -#define BIFROST_MAD_ACC_4(acc, sum, x, y) BIFROST_MAD_4(acc, x, y) +#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc) #endif /* WEIGHTS_OFFSET != 0 */ /** This function computes the depthwise convolution quantized. @@ -318,6 +313,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( #endif /* defined(HAS_BIAS) */ ) { + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); Vector weights = CONVERT_TO_VECTOR_STRUCT(weights); #if defined(HAS_BIAS) @@ -326,20 +325,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr); #endif /* defined(HAS_BIAS) */ - __global uchar *first_elem = src_ptr + src_offset_first_element_in_bytes; - - const int z = get_global_id(2); - const int pad_offs = -ROWS_READ * src_stride_y; - const int src_offs0 = get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + z * src_step_z - CONV_PAD_TOP * src_stride_z; - const int src_offs1 = src_offs0 + src_stride_z; - const int src_offs2 = src_offs1 + src_stride_z; - - const int cond_top = z - CONV_PAD_TOP < 0; - const int cond_bottom = z * (src_step_z / src_stride_z) + 2 > SRC_DEPTH; - - __global uchar *src_addr0 = first_elem + select(src_offs0, pad_offs, cond_top); - __global uchar *src_addr1 = first_elem + src_offs1; - __global uchar *src_addr2 = first_elem + select(src_offs2, pad_offs, cond_bottom); + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_step_x; + int8 y_coord = (int8)(y * (src_step_y / src_stride_y)) + (int8)(0, 1, 2, 3, 4, 5, 0, 0) - CONV_PAD_LEFT; + int z_coord = z * (src_step_z / src_stride_z) - CONV_PAD_TOP; VEC_INT sum_we = 0; VEC_INT acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; @@ -355,34 +343,34 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT); #endif /* INPUT_OFFSET != 0 */ - VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc0, sum0, values, w0); + int valid_z = z_coord; + int8 valid_y = select(y_coord, -1, (int8)valid_z < 0); // If z < 0, set y to -1 + valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 + valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1); // Clamp z coordinate + + VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc0, sum0, values, w1); - BIFROST_MAD_ACC_4(acc1, sum1, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc0, sum0, values, w2); - BIFROST_MAD_ACC_4(acc1, sum1, values, w1); - BIFROST_MAD_ACC_4(acc2, sum2, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc1, sum1, values, w2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w1); - BIFROST_MAD_ACC_4(acc3, sum3, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc2, sum2, values, w2); - BIFROST_MAD_ACC_4(acc3, sum3, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc3, sum3, values, w2); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3); weights.ptr += weights_stride_z; @@ -395,34 +383,33 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT); #endif /* INPUT_OFFSET != 0 */ - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc0, sum0, values, w0); + // Only unit pad_top/bottom allowed, this can never be out of bound + valid_z = z_coord + 1; + valid_y = y_coord; - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc0, sum0, values, w1); - BIFROST_MAD_ACC_4(acc1, sum1, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc0, sum0, values, w2); - BIFROST_MAD_ACC_4(acc1, sum1, values, w1); - BIFROST_MAD_ACC_4(acc2, sum2, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc1, sum1, values, w2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w1); - BIFROST_MAD_ACC_4(acc3, sum3, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc2, sum2, values, w2); - BIFROST_MAD_ACC_4(acc3, sum3, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc3, sum3, values, w2); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3); + + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3); weights.ptr += weights_stride_z; @@ -435,34 +422,34 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT); #endif /* INPUT_OFFSET != 0 */ - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc0, sum0, values, w0); + valid_z = z_coord + 2; + valid_y = select(y_coord, -1, (int8)valid_z < 0); + valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); + valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1); + + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc0, sum0, values, w1); - BIFROST_MAD_ACC_4(acc1, sum1, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc0, sum0, values, w2); - BIFROST_MAD_ACC_4(acc1, sum1, values, w1); - BIFROST_MAD_ACC_4(acc2, sum2, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc1, sum1, values, w2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w1); - BIFROST_MAD_ACC_4(acc3, sum3, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w2); - BIFROST_MAD_ACC_4(acc3, sum3, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc3, sum3, values, w2); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3); #if defined(HAS_BIAS) acc0 += bias_values; @@ -565,6 +552,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride2( #endif /* defined(HAS_BIAS) */ ) { + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); Vector weights = CONVERT_TO_VECTOR_STRUCT(weights); #if defined(HAS_BIAS) @@ -573,20 +564,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride2( VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr); #endif /* defined(HAS_BIAS) */ - __global uchar *first_elem = src_ptr + src_offset_first_element_in_bytes; - - const int z = get_global_id(2); - const int pad_offs = -ROWS_READ * src_stride_y; - const int src_offs0 = get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + z * src_step_z - CONV_PAD_TOP * src_stride_z; - const int src_offs1 = src_offs0 + src_stride_z; - const int src_offs2 = src_offs1 + src_stride_z; - - const int cond_top = z - CONV_PAD_TOP < 0; - const int cond_bottom = z * (src_step_z / src_stride_z) + 2 > SRC_DEPTH; - - __global uchar *src_addr0 = first_elem + select(src_offs0, pad_offs, cond_top); - __global uchar *src_addr1 = first_elem + src_offs1; - __global uchar *src_addr2 = first_elem + select(src_offs2, pad_offs, cond_bottom); + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_step_x; + int8 y_coord = (int8)(y * (src_step_y / src_stride_y)) + (int8)(0, 1, 2, 3, 4, 5, 0, 0) - CONV_PAD_LEFT; + int z_coord = z * (src_step_z / src_stride_z) - CONV_PAD_TOP; VEC_INT sum_we = 0; VEC_INT acc0 = 0, acc2 = 0; @@ -602,25 +582,26 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride2( sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT); #endif /* INPUT_OFFSET != 0 */ - VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc0, sum0, values, w0); + int valid_z = z_coord; + int8 valid_y = select(y_coord, -1, (int8)valid_z < 0); // If z < 0, set y to -1 + valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 + valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1); // Clamp z coordinate - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc0, sum0, values, w1); + VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc0, sum0, values, w2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc2, sum2, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2); - src_addr0 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr0); - BIFROST_MAD_ACC_4(acc2, sum2, values, w2); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2); + + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2); weights.ptr += weights_stride_z; @@ -633,25 +614,25 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride2( sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT); #endif /* INPUT_OFFSET != 0 */ - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc0, sum0, values, w0); + // Only unit pad_top/bottom allowed, this can never be out of bound + valid_z = z_coord + 1; + valid_y = y_coord; + + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc0, sum0, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc0, sum0, values, w2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc2, sum2, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2); - src_addr1 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr1); - BIFROST_MAD_ACC_4(acc2, sum2, values, w2); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2); weights.ptr += weights_stride_z; @@ -664,25 +645,26 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride2( sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT); #endif /* INPUT_OFFSET != 0 */ - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc0, sum0, values, w0); + valid_z = z_coord + 2; + valid_y = select(y_coord, -1, (int8)valid_z < 0); + valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); + valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1); + + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc0, sum0, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc0, sum0, values, w2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w0); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0); + MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w1); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2); - src_addr2 += src_stride_y; - values = VLOAD(VEC_SIZE)(0, src_addr2); - BIFROST_MAD_ACC_4(acc2, sum2, values, w2); + values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z); + MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2); #if defined(HAS_BIAS) acc0 += bias_values; @@ -721,6 +703,6 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride2( (res2, 0, dst.ptr + 1 * dst_stride_y); } -#endif /* defined(VEC_SIZE) && defined(SRC_DEPTH) && defined(CONV_PAD_TOP) && defined(ROWS_READ) */ +#endif /* defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) */ #endif /* defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */ diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index d5b34c39cb..752a810520 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -195,7 +195,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel() - : _conv_stride_x(0), _conv_pad_top(0) + : _conv_stride_x(0), _conv_pad_top(0), _conv_pad_left(0) { } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index a54e92c63a..d24ef0f496 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -77,7 +77,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2; const unsigned int num_rows_written_per_iteration = num_rows_processed_per_iteration / conv_info.stride().first; - const BorderSize border_size(conv_info.pad_left() + num_rows_read_per_iteration * std::max(conv_info.pad_top(), conv_info.pad_bottom()), 0, conv_info.pad_right(), 0); + const BorderSize border_size(std::max(conv_info.pad_left(), conv_info.pad_top()), 0, std::max(conv_info.pad_right(), conv_info.pad_bottom()), 0); // Configure kernel window Window win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration)); @@ -140,13 +140,11 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, _weights = weights; _biases = biases; _conv_stride_y = conv_info.stride().second; - _conv_pad_left = conv_info.pad_left(); _num_rows_processed_per_iteration = 4; const unsigned int num_elems_accessed_per_iteration = 4; - const unsigned int num_rows_read_per_iteration = _num_rows_processed_per_iteration + 2; - _border_size = BorderSize(_conv_pad_left + num_rows_read_per_iteration * std::max(conv_info.pad_top(), conv_info.pad_bottom()), 0, conv_info.pad_right(), 0); + _border_size = BorderSize(std::max(conv_info.pad_left(), conv_info.pad_top()), 0, std::max(conv_info.pad_right(), conv_info.pad_bottom()), 0); float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale; int output_multiplier = 0; @@ -162,9 +160,10 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration)); - build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2))); + build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1))); + build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2))); build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DROWS_READ=" + support::cpp11::to_string(num_rows_read_per_iteration)); + build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); if(act_info.enabled()) { @@ -236,7 +235,6 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com // Create input window and adjust Window win_in = window; - win_in.adjust(Window::DimY, -_conv_pad_left, true); win_in.set_dimension_step(Window::DimY, _num_rows_processed_per_iteration); win_in.set_dimension_step(Window::DimZ, _conv_stride_y); -- cgit v1.2.1