From 0841ca085301e8ddbc9627b2be55758b66437c15 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 1 Feb 2021 14:37:02 +0000 Subject: Fix OpenCL direct convolution - The ARM DOT macro was using wrong variables for performing the dot product - K0 could be a non power of 2 values when IFM was not a multiple of 16 - Refactor the test for direct convolution NHWC Resolves COMPMID-4135, COMPMID-4155 Change-Id: I3a2dc89ef613ae20245cfc28e76ea36c55eaf81d Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4962 Comments-Addressed: Arm Jenkins Reviewed-by: TeresaARM Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/direct_convolution.cl | 34 +++++++++------------- .../CL/kernels/CLDirectConvolutionLayerKernel.cpp | 2 +- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl index 87f8153118..5d2a24e740 100644 --- a/src/core/CL/cl_kernels/direct_convolution.cl +++ b/src/core/CL/cl_kernels/direct_convolution.cl @@ -34,24 +34,24 @@ #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #define ARM_DOT(x, y, val) \ ({ \ - val += (ACC_DATA_TYPE)a.s0 * (ACC_DATA_TYPE)b.s0; \ - val += (ACC_DATA_TYPE)a.s1 * (ACC_DATA_TYPE)b.s1; \ - val += (ACC_DATA_TYPE)a.s2 * (ACC_DATA_TYPE)b.s2; \ - val += (ACC_DATA_TYPE)a.s3 * (ACC_DATA_TYPE)b.s3; \ + val += (ACC_DATA_TYPE)x.s0 * (ACC_DATA_TYPE)y.s0; \ + val += (ACC_DATA_TYPE)x.s1 * (ACC_DATA_TYPE)y.s1; \ + val += (ACC_DATA_TYPE)x.s2 * (ACC_DATA_TYPE)y.s2; \ + val += (ACC_DATA_TYPE)x.s3 * (ACC_DATA_TYPE)y.s3; \ }) #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#define ARM_DOT1(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 3))0), (VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 3))0), c); \ +#define ARM_DOT1(a, b, c) \ + ({ \ + ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 3))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 3))0)), c); \ }) -#define ARM_DOT2(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 2))0), (VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 2))0), c); \ +#define ARM_DOT2(a, b, c) \ + ({ \ + ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 2))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 2))0)), c); \ }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (SRC_DATA_TYPE)0), (VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (WEI_DATA_TYPE)0), c); \ +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (SRC_DATA_TYPE)0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (WEI_DATA_TYPE)0)), c); \ }) #define ARM_DOT4(a, b, c) \ ({ \ @@ -539,14 +539,12 @@ __kernel void direct_convolution_nhwc( TENSOR_DOT(K0, 0); -#undef TENSOR_DOT - wei_offset += K0 * sizeof(WEI_DATA_TYPE); } #if(SRC_CHANNELS % K0) != 0 // Left-over accumulations - for(; i < SRC_CHANNELS; ++i) + for(; k < SRC_CHANNELS; ++k) { // Load values from src tensor LOAD_BLOCK_INDIRECT(M0, 1, SRC_DATA_TYPE, a, src_ptr, src_offset + k * sizeof(SRC_DATA_TYPE), src_stride_y, mi_valid_row, mi_mask); @@ -554,10 +552,6 @@ __kernel void direct_convolution_nhwc( // Load values from weights tensor LOAD_BLOCK(N0, 1, WEI_DATA_TYPE, b, wei_ptr, wei_offset, wei_stride_w, zero); -#define TENSOR_DOT(i) \ - ARM_DOT_K0XN0(1, a##i, b, c##i); \ - ARM_OFFSET_K0XN0(1, a##i, b, SRC_OFFSET, WEI_OFFSET, c##i); - TENSOR_DOT(1, 0); #undef TENSOR_DOT diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index 3ca72b3e5d..3948261bc3 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -375,7 +375,7 @@ void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_c const unsigned int n0 = win_config.second.x().step(); const unsigned int m0 = win_config.second.y().step(); - const unsigned int k0 = std::min(static_cast(_input->info()->dimension(channel_idx)), 16u); + const unsigned int k0 = adjust_vec_size(16u, _input->info()->dimension(channel_idx)); const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0; const unsigned int partial_store_m0 = (_output->info()->dimension(width_idx) * _output->info()->dimension(height_idx)) % m0; const unsigned int pad_left = conv_info.pad_left(); -- cgit v1.2.1