From 9f7d55a3566b0f1044110000b033d663b26d3a6c Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Mon, 8 Feb 2021 13:20:24 +0000 Subject: Fix CLDepthwiseConvolutionLayer 3x3 QASYMM8 Fix errors when computing tensors with one element only - Replace Tensor3D with raw pointers so to get rid of offset to first element for NCHW layout - Add stronger out of bound constraints for NHWC layout - Set the border size to the input's padding for NHWC - Fill the strides == 0 with the largest stride, so to avoid accessing empty strides and multiplying by 0 Resolve COMPMID-4088 Change-Id: I751a4e6d7094b3c42306ff7f53af848fd35f19ac Signed-off-by: Giorgio Arena Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5024 Tested-by: Arm Jenkins Reviewed-by: Manuel Bottini Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../cl_kernels/depthwise_convolution_quantized.cl | 44 ++++++++++++---------- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 4 +- .../CLGEMMLowpMatrixMultiplyNativeKernel.cpp | 6 +-- .../CLGEMMLowpMatrixMultiplyReshapedKernel.cpp | 6 +-- ...GEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp | 6 +-- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 6 +-- .../kernels/CLGEMMMatrixMultiplyNativeKernel.cpp | 6 +-- .../kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp | 6 +-- .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp | 6 +-- src/core/helpers/Utils.h | 26 ++++++++++++- tests/validation/UNIT/TensorInfo.cpp | 12 +++--- 11 files changed, 78 insertions(+), 50 deletions(-) diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index d39089b923..285c00a713 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -194,11 +194,11 @@ __kernel void dwc_3x3_native_quantized8_nchw( #endif //defined(HAS_BIAS) ) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); - Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); + __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z; + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); // Extract channel and linearized batch indices const int channel = get_global_id(2) % DST_CHANNELS; @@ -211,7 +211,7 @@ __kernel void dwc_3x3_native_quantized8_nchw( #endif //defined(HAS_BIAS) // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER) - src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; + src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z; VEC_DATA_TYPE(WEIGHTS_TYPE, 3) @@ -235,7 +235,7 @@ __kernel void dwc_3x3_native_quantized8_nchw( // Row0 int8 left, middle, right; - GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right); + GET_VALUES(src_addr + 0 * src_stride_y, left, middle, right); values0 += left * (int8)(w0.s0); values0 += middle * (int8)(w0.s1); values0 += right * (int8)(w0.s2); @@ -245,10 +245,11 @@ __kernel void dwc_3x3_native_quantized8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ // Row1 - GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right); + GET_VALUES(src_addr + DILATION_Y * src_stride_y, left, middle, right); values0 += left * (int8)(w1.s0); values0 += middle * (int8)(w1.s1); values0 += right * (int8)(w1.s2); + #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 values1 += left * (int8)(w0.s0); values1 += middle * (int8)(w0.s1); @@ -264,7 +265,7 @@ __kernel void dwc_3x3_native_quantized8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ // Row2 - GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right); + GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left, middle, right); values0 += left * (int8)(w2.s0); values0 += middle * (int8)(w2.s1); values0 += right * (int8)(w2.s2); @@ -284,7 +285,7 @@ __kernel void dwc_3x3_native_quantized8_nchw( #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 // Row3 - GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right); + GET_VALUES(src_addr + 3 * src_stride_y, left, middle, right); values1 += left * (int8)(w2.s0); values1 += middle * (int8)(w2.s1); values1 += right * (int8)(w2.s2); @@ -511,11 +512,11 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( #endif //defined(HAS_BIAS) ) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); - Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); + __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z; + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); // Extract channel and linearized batch indices const int channel = get_global_id(2) % DST_CHANNELS; @@ -528,7 +529,7 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( #endif //defined(HAS_BIAS) // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER) - src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; + src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z; VEC_TYPE(3) @@ -551,9 +552,9 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( int8 values0 = 0; int8 sum0 = 0; - GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0); - GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1); - GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2); + GET_VALUES(src_addr + 0 * src_stride_y, left0, middle0, right0); + GET_VALUES(src_addr + DILATION_Y * src_stride_y, left1, middle1, right1); + GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2); #if WEIGHTS_OFFSET != 0 sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0); @@ -569,7 +570,7 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( int8 values1 = 0; int8 sum1 = 0; - GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3); + GET_VALUES(src_addr + 3 * src_stride_y, left3, middle3, right3); #if WEIGHTS_OFFSET != 0 sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1); @@ -923,7 +924,9 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc( // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y; + z_coord = min((uint)z_coord, (uint)SRC_DIM_2); offset = y_offset + (int4)(z_coord * src_stride_z); + offset = min(offset, (int4)max_offset); VEC_TYPE(VEC_SIZE) values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); VEC_TYPE(VEC_SIZE) @@ -934,6 +937,7 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc( // z == 2 // Offset can be out-of-bound so we need to check if it is greater than max_offset z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2; + z_coord = min((uint)z_coord, (uint)SRC_DIM_2); offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); VEC_TYPE(VEC_SIZE) diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index f553fd1849..43c3ff3bfd 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -233,7 +233,7 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext if(_is_quantized) { - _border_size = BorderSize(is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); + _border_size = BorderSize(input->info()->padding()); // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1 if(is_dot8_supported) diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp index 1f89865908..5633ee5a28 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -284,8 +284,8 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::Command if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } Window slice = window.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp index ded4b29ae7..3043e01514 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -254,8 +254,8 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::Comma if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } Window slice = window.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp index 77cea24829..0122e3ba4b 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -488,8 +488,8 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } Window slice = window.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 2419104fba..a653e29f8f 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -484,8 +484,8 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } Window slice = window.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp index 387f1a4ebc..fefcd2f74d 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -348,8 +348,8 @@ void CLGEMMMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueu if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } Window slice = window.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp index 23e18bac92..8a403555f5 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -359,8 +359,8 @@ void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQu if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } Window slice = window.first_slice_window_3D(); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp index 1f296f8e26..de986de9f6 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -352,8 +352,8 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co if(_input1->info()->num_dimensions() < 3) { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + // The stride_w for matrix B must be the same as stride_z if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != _input1->info()->strides_in_bytes()[2]); } const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h index 3c3b2b93f9..d64eddb9aa 100644 --- a/src/core/helpers/Utils.h +++ b/src/core/helpers/Utils.h @@ -1,5 +1,5 @@ /* -* Copyright (c) 2020 Arm Limited. +* Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,30 @@ inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fix strides.set(i, shape[i - 1] * strides[i - 1]); } + size_t first_zero = std::distance(strides.begin(), std::find_if(strides.begin(), strides.end(), [](uint32_t val) + { + return val == 0U; + })); + + if(first_zero > 0) + { + if(first_zero == 1) + { + strides.set(1, strides[0] * (shape[0] + info.padding().left + info.padding().right)); + ++first_zero; + } + else if(first_zero == 2) + { + strides.set(2, strides[1] * (shape[1] + info.padding().top + info.padding().bottom)); + ++first_zero; + } + + for(size_t i = first_zero; i < Strides::num_max_dimensions; ++i) + { + strides.set(i, strides[first_zero - 1]); + } + } + return strides; } diff --git a/tests/validation/UNIT/TensorInfo.cpp b/tests/validation/UNIT/TensorInfo.cpp index b5928cc277..44c9342389 100644 --- a/tests/validation/UNIT/TensorInfo.cpp +++ b/tests/validation/UNIT/TensorInfo.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,11 +60,11 @@ DATA_TEST_CASE(AutoPadding, framework::DatasetMode::ALL, zip(zip(zip( PaddingSize{ 4, 36, 4, 4 }})), framework::dataset::make("Strides", { Strides{}, - Strides{ 1U, 50U }, - Strides{ 1U, 50U }, - Strides{ 1U, 50U, 900U }, - Strides{ 1U, 50U, 900U, 9000U }, - Strides{ 1U, 50U, 900U, 9000U, 90000U }, + Strides{ 1U, 50U, 50U, 50U, 50U, 50U }, + Strides{ 1U, 50U, 900U, 900U, 900U, 900U }, + Strides{ 1U, 50U, 900U, 900U, 900U, 900U }, + Strides{ 1U, 50U, 900U, 9000U, 9000U, 9000U }, + Strides{ 1U, 50U, 900U, 9000U, 90000U, 90000U }, Strides{ 1U, 50U, 900U, 9000U, 90000U, 900000U }})), framework::dataset::make("Offset", { 0U, 4U, 204U, 204U, 204U, 204U, 204U })), shape, auto_padding, strides, offset) -- cgit v1.2.1