From 30124354c6848c49f9740d1944d2445782255a85 Mon Sep 17 00:00:00 2001 From: Adnan AlSinan Date: Thu, 2 Dec 2021 19:12:20 +0000 Subject: Remove padding from ClDirectConv2dKernel - Delete old NCHW ClDirectConv2d kernels. - Merge all kernels on a single file. - Removed padding from ClDirectConv2dKernel Resolves COMPMID-4721 Signed-off-by: Adnan AlSinan Change-Id: I624d218fb770e7b5f3c0acd4e85a21ae48470f55 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6779 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Gian Marco Iodice --- Android.bp | 5 +- SConscript | 5 +- src/core/CL/cl_kernels/nchw/direct_convolution.cl | 147 +++++++++ .../CL/cl_kernels/nchw/direct_convolution1x1.cl | 316 ------------------- .../CL/cl_kernels/nchw/direct_convolution3x3.cl | 291 ----------------- .../CL/cl_kernels/nchw/direct_convolution5x5.cl | 313 ------------------- .../nchw/direct_convolution_quantized.cl | 308 ------------------ src/gpu/cl/ClKernelLibrary.cpp | 24 +- src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 343 +++++---------------- src/gpu/cl/kernels/ClDirectConv2dKernel.h | 4 +- src/gpu/cl/operators/ClDirectConv2d.cpp | 2 +- 11 files changed, 224 insertions(+), 1534 deletions(-) create mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl diff --git a/Android.bp b/Android.bp index 727e7c11a5..b6006be52a 100644 --- a/Android.bp +++ b/Android.bp @@ -81,10 +81,7 @@ opencl_srcs = [ "src/core/CL/cl_kernels/nchw/channel_shuffle.cl", "src/core/CL/cl_kernels/nchw/depth_to_space.cl", "src/core/CL/cl_kernels/nchw/dequantization_layer.cl", - "src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl", - "src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl", - "src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl", - "src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl", + "src/core/CL/cl_kernels/nchw/direct_convolution.cl", "src/core/CL/cl_kernels/nchw/im2col.cl", "src/core/CL/cl_kernels/nchw/normalization_layer.cl", "src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl", diff --git a/SConscript b/SConscript index a8995aca92..7e901019cb 100644 --- a/SConscript +++ b/SConscript @@ -353,10 +353,7 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl', 'src/core/CL/cl_kernels/nchw/channel_shuffle.cl', 'src/core/CL/cl_kernels/nchw/depth_to_space.cl', - 'src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl', - 'src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl', - 'src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl', - 'src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl', + 'src/core/CL/cl_kernels/nchw/direct_convolution.cl', 'src/core/CL/cl_kernels/nchw/dequantization_layer.cl', 'src/core/CL/cl_kernels/nchw/im2col.cl', 'src/core/CL/cl_kernels/nchw/normalization_layer.cl', diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution.cl b/src/core/CL/cl_kernels/nchw/direct_convolution.cl new file mode 100644 index 0000000000..866f62da95 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution.cl @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "helpers_asymm.h" + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 + * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 + * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 + * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 + * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3 + * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + const int id0 = get_global_id(0); + const int id1 = get_global_id(1); + const int id2 = get_global_id(2); + + const int x_coords = (id0 * STRIDE_X) - PAD_LEFT; + const int y_coords = (id1 * STRIDE_Y) - PAD_TOP; + + const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + + __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes); + __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w); + __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z; + +#ifdef IS_QUANTIZED + int acc_value = 0; +#else /* IS_QUANTIZED */ + DATA_TYPE acc_value = 0; +#endif /* IS_QUANTIZED */ + for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) + { + for(int y = 0; y < WEI_HEIGHT; ++y) + { + for(int x = 0; x < WEI_WIDTH; ++x) + { + const int idx_x = (x_coords + x); + const int idx_y = (y_coords + y); + if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT)) + { + const int weight_offset = x + (WEI_HEIGHT * y); + const int input_offset = idx_x + SRC_WIDTH * idx_y; +#ifdef IS_QUANTIZED + int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset)); + int 
input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset)); + acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET); +#else /* IS_QUANTIZED */ + DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset); + DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset); + acc_value += input * weight; +#endif /* IS_QUANTIZED */ + } + } + } + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#ifdef IS_QUANTIZED + int bias = *((__global int *)(vector_offset(&biases, id2))); +#else /* IS_QUANTIZED */ + DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2))); +#endif /* IS_QUANTIZED */ + acc_value += bias; + +#endif /* defined(HAS_BIAS) */ + +#ifdef IS_QUANTIZED + +#if OUTPUT_SHIFT < 0 + acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1); +#else // OUTPUT_SHIFT < 0 + acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1); +#endif // OUTPUT_SHIFT < 0 + acc_value = acc_value + OUTPUT_OFFSET; +#endif /* IS_QUANTIZED */ + + *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE); +} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl deleted file mode 100644 index 8ab2d1d4ea..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#undef CONVERT_SAT - -#define ADD_OP(a, b) ((a) + (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define CONVERT_SAT(a, b) ((a)) - -#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 3 -#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size -#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size) -#elif STRIDE_X == 2 -#define INPUT_PIXEL(data_size) extract_input_stride2 -#elif STRIDE_X == 1 -#define INPUT_PIXEL(data_size) extract_input_stride1 -#else /* STRIDE_X not equals 1, 2 or 3 */ -#error "Only support strides 1, 2 and 3" -#endif /* STRIDE_X == 3 */ - -/** Extracts a 1D horizontal vector from the input tensor with stride as 1. - * - * @param[in] input_pixel Pointer to the first pixel. 
- * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel) -{ - return vload8(0, input_pixel); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 2. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp = vload16(0, input_pixel); - return temp.s02468ace; -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 4) - temp1 = vload4(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp2 = vload4(0, input_pixel + 6); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp3 = vload4(0, input_pixel + 12); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp4 = vload4(0, input_pixel + 18); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp1 = vload8(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp2 = vload8(0, input_pixel + 8); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp3 = vload8(0, input_pixel + 16); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp1 = vload16(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 16) - temp2 = vload16(0, input_pixel + 12); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); -} - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 - * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution1x1( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); -#endif /* defined(HAS_BIAS) */ - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8) - values = 0; - - const uint z_index = get_global_id(2); - - weights.ptr += z_index * weights_stride_w; - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr; - VEC_DATA_TYPE(DATA_TYPE, 8) - input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr); - values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel)); - src.ptr += src_stride_z; - weights.ptr += weights_stride_z; - } - -#ifdef HAS_BIAS - values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index)))); -#endif /* defined(HAS_BIAS) */ - - vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \ - ({ \ - acc.s0 = mad(src.s0, weight_value, acc.s0); \ - acc.s1 = mad(src.s1, weight_value, acc.s1); \ - acc.s2 = mad(src.s2, weight_value, acc.s2); \ - acc.s3 = mad(src.s3, weight_value, acc.s3); \ - }) - -/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases, -DHAS_BIAS must to be passed at compile - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution1x1_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 acc0 = 0.0f; - float4 acc1 = 0.0f; - float4 acc2 = 0.0f; - float4 acc3 = 0.0f; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights - float weight = *((__global float *)weights_addr); - - // Load values from row0 of input tensor - float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); - float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); - float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); - - CONVOLUTION1x1_BIFROST(acc0, src0, weight); - CONVOLUTION1x1_BIFROST(acc1, src1, weight); - CONVOLUTION1x1_BIFROST(acc2, src2, weight); - CONVOLUTION1x1_BIFROST(acc3, src3, weight); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float bias = (float) * ((__global float 
*)(vector_offset(&biases, kernel_index))); - - acc0.s0 += bias; - acc0.s1 += bias; - acc0.s2 += bias; - acc0.s3 += bias; - acc1.s0 += bias; - acc1.s1 += bias; - acc1.s2 += bias; - acc1.s3 += bias; - acc2.s0 += bias; - acc2.s1 += bias; - acc2.s2 += bias; - acc2.s3 += bias; - acc3.s0 += bias; - acc3.s1 += bias; - acc3.s2 += bias; - acc3.s3 += bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); - vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl deleted file mode 100644 index 811df053c4..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" - -#undef CONVERT_SAT - -#define ADD_OP(a, b) ((a) + (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define CONVERT_SAT(a, b) ((a)) - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 1 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X == 2 */ - -#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - weights_values0 = vload3(0, weights_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 8) \ - src0 = vload8(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - src1 = vload2(0, src_row_ptr + 8); \ - \ - acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ - }) - -#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - weights_values0 = vload3(0, weights_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 16) \ - src0 = vload16(0, src_row_ptr); \ - DATA_TYPE src1 = *(src_row_ptr + 16); \ - \ - acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ - }) - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note This OpenCL kernel works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution3x3( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8) - values0 = 0; - - __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)))); -#endif /* defined(HAS_BIAS) */ - - vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global 
DATA_TYPE *)dst.ptr); -} -#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \ - ({ \ - acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \ - acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \ - acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \ - acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \ - acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \ - acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \ - acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \ - acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \ - acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \ - acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \ - acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \ - acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \ - }) - -/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases, -DHAS_BIAS must to be passed at compile - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution3x3_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 values0 = 0; - float4 values1 = 0; - float4 values2 = 0; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights - float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y)); - float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y)); - float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y)); - float4 src0; - float2 src1; - - // Load values from row0 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4); - - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0); - - // Load values from row1 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1); - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0); - - // Load values from row2 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2); - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1); - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0); - - // Load values from row3 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2); - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1); - - // Row4 - src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); - - values0 += (float4)bias; - values1 += (float4)bias; - values2 += (float4)bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - 
vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl deleted file mode 100644 index 59d668f0bf..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#undef CONVERT_SAT - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 1 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X == 2 */ - -#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - weights_values0 = vload4(0, weights_row_ptr); \ - DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ - VEC_DATA_TYPE(DATA_TYPE, 8) \ - src0 = vload8(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - src1 = vload4(0, src_row_ptr + 8); \ - \ - acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ - }) - -#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - weights_values0 = vload4(0, weights_row_ptr); \ - DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ - VEC_DATA_TYPE(DATA_TYPE, 16) \ - src0 = vload16(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - src1 = vload4(0, src_row_ptr + 16); \ - acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, 
src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ - \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ - }) - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution5x5( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE, 8) - values0 = 0; - - __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))); -#endif /* defined(HAS_BIAS) */ - - vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \ - ({ \ - acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \ - acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \ - acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \ - acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \ - acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \ - acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \ - acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \ - acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \ - acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \ - acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \ - acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \ - acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \ - acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \ - acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \ - acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \ - acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \ - acc.s0 = mad(src0.s4, weights_row01, acc.s0); \ - acc.s1 = mad(src0.s5, weights_row01, acc.s1); \ - acc.s2 = mad(src0.s6, weights_row01, acc.s2); \ - acc.s3 = mad(src0.s7, weights_row01, acc.s3); \ - }) - -/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL 
kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution5x5_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 values0 = 0.0f; - float4 values1 = 0.0f; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights from row0 and row1 - float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y)); - float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4); - float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y)); - float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4); - float8 src0; - - // Load values from row0 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - - // Load values from row1 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - // Load values from row2 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y)); - - // Load weights from row2 - weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y)); - weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); - - // Load values from row3 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y)); - - // Load weights from row3 - weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y)); - weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - // Load values from row4 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y)); - - // Load weights from row4 - weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y)); - weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4); - - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); - - // Load 
values from row5 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index))); - - values0 += bias; - values1 += bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl deleted file mode 100644 index b80d4f587e..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers_asymm.h" - -#undef CONVERT_SAT_STR -#undef CONVERT_SAT - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) - -#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -#if KERNEL_SIZE == 9 - -#if STRIDE_X == 1 -#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 5 - -#if STRIDE_X == 1 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else 
/* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ - int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ - int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \ - acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 3 - -#if STRIDE_X == 1 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ - int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ - int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \ - acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int src1 = convert_int(*(src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 1 - -#if STRIDE_X == 3 -#define INPUT_VALUE extract_input_stride3 -#elif STRIDE_X == 2 -#define INPUT_VALUE extract_input_stride2 -#elif STRIDE_X == 1 -#define 
INPUT_VALUE extract_input_stride1 - -#else /* STRIDE_X not equals 1, 2 or 3 */ -#error "Only support strides 1, 2 and 3" -#endif /* STRIDE_X */ - -/** Extracts a 1D horizontal vector from the input tensor with stride as 1. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value) -{ - return vload8(0, input_value); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 2. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp = vload16(0, input_value); - return temp.s02468ace; -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp1 = vload16(0, input_value); - VEC_DATA_TYPE(DATA_TYPE, 16) - temp2 = vload16(0, input_value + 12); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); -} - -#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */ -#error "Only kernel sizes 1, 3, 5 and 9 are supported" -#endif /* KERNEL_SIZE */ - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 - * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 - * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3 - * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3 - * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32 - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution_quantized( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - int8 values0 = 0; - - __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0); - __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { -#if KERNEL_SIZE == 9 - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global 
DATA_TYPE *)(weights_addr + 5 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y)); -#elif KERNEL_SIZE == 5 - CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); -#elif KERNEL_SIZE == 3 - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); -#elif KERNEL_SIZE == 1 - int weight = convert_int(*(__global DATA_TYPE *)weights_addr); - int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr)); - values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET); -#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */ - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index))); - values0 += (int8)(*bias_addr); -#endif /* defined(HAS_BIAS) */ - -#if OUTPUT_SHIFT < 0 - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); -#else // OUTPUT_SHIFT < 0 - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); -#endif // OUTPUT_SHIFT < 0 - values0 = values0 + OUTPUT_OFFSET; - - vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index f87b226a64..92a9d9c25a 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -363,12 +363,8 @@ const std::map ClKernelLibrary::_kernel_program_map = { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" }, - { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" }, - { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" }, - { "direct_convolution5x5_f32_bifrost", 
"nchw/direct_convolution5x5.cl" }, - { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" }, + { "direct_convolution_nchw", "nchw/direct_convolution.cl" }, + { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, { "im2col3x3_nchw", "nchw/im2col.cl" }, { "im2col5x5_nchw", "nchw/im2col.cl" }, @@ -767,20 +763,8 @@ const std::map ClKernelLibrary::_program_source_map = #include "./cl_kernels/nchw/dequantization_layer.clembed" }, { - "nchw/direct_convolution1x1.cl", -#include "./cl_kernels/nchw/direct_convolution1x1.clembed" - }, - { - "nchw/direct_convolution3x3.cl", -#include "./cl_kernels/nchw/direct_convolution3x3.clembed" - }, - { - "nchw/direct_convolution5x5.cl", -#include "./cl_kernels/nchw/direct_convolution5x5.clembed" - }, - { - "nchw/direct_convolution_quantized.cl", -#include "./cl_kernels/nchw/direct_convolution_quantized.clembed" + "nchw/direct_convolution.cl", +#include "./cl_kernels/nchw/direct_convolution.clembed" }, { "nchw/im2col.cl", diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 5af7aa9662..ff8c2c32a0 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -122,209 +122,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) -{ - return gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && (kernel_size <= 5) - && (conv_stride_x == 1) && (conv_stride_y == 1) - && (data_type == DataType::F32) - && (data_layout == DataLayout::NCHW); -} - -inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, - unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, - unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) -{ - const DataType data_type = src->data_type(); - const DataLayout data_layout = src->data_layout(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); - - if(run_optimized_bifrost) - { - // Configure kernel window - switch(kernel_size) - { - case 1: - { - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 4; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 4; - break; - } - case 3: - { - num_elems_read_per_iteration_x = 6; - num_elems_read_per_iteration_y = 5; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 3; - break; - } - case 5: - { - num_elems_read_per_iteration_x = 8; - num_elems_read_per_iteration_y = 6; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 2; - break; - } - default: - { - ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); - } - } - } - else - { - num_elems_read_per_iteration_y = kernel_size; - num_elems_written_per_iteration_x = 8; - num_elems_written_per_iteration_y = 1; - switch(kernel_size) - { - case 1: - switch(conv_stride_x) - { - case 1: - 
num_elems_read_per_iteration_x = 8; - break; - case 2: - num_elems_read_per_iteration_x = 16; - break; - case 3: - switch(src->element_size()) - { - case 1: - num_elems_read_per_iteration_x = 28; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - case 4: - num_elems_read_per_iteration_x = 22; - break; - default: - ARM_COMPUTE_ERROR("Invalid data size"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 3: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 10; - break; - case 2: - num_elems_read_per_iteration_x = 17; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 5: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 12; - break; - case 2: - num_elems_read_per_iteration_x = 20; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 9: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 16; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid direct convolution size"); - } - } -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target) -{ - const DataLayout data_layout = src->data_layout(); - - // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); - - if(data_layout == DataLayout::NHWC) - { - const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); - unsigned int num_rows = 1U; - if(dst->tensor_shape()[0] > 16) - { - num_rows = src->data_type() == DataType::F32 ? 
2U : 4U; - } - - // Create window and update padding - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); - return std::make_pair(Status{}, win); - } - else if(data_layout == DataLayout::NCHW) - { - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int kernel_size = weights->dimension(width_idx); - - unsigned int num_elems_read_per_iteration_x = 0; - unsigned int num_elems_read_per_iteration_y = 0; - unsigned int num_elems_written_per_iteration_x = 0; - unsigned int num_elems_written_per_iteration_y = 0; - - unsigned int conv_pad_left = conv_info.pad_left(); - unsigned int conv_pad_top = conv_info.pad_top(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, - num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, - kernel_size, conv_info, target, src); - - // Create window and update padding - bool window_changed = false; - Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); - AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) { if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) @@ -370,11 +167,6 @@ bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataL } // namespace -BorderSize ClDirectConv2dKernel::border_size() const -{ - return _border_size; -} - ClDirectConv2dKernel::ClDirectConv2dKernel() { _type = CLKernelType::DIRECT; @@ -400,24 +192,49 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT const unsigned int kernel_size = weights->dimension(width_idx); const DataType data_type = src->data_type(); - const GPUTarget gpu_target = get_target(); + const GPUTarget gpu_target = get_target(); + unsigned int _num_elems_processed_per_iteration = 0; + + // Get dst shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, output_shape, + 1, + src->data_type(), + src->quantization_info()); // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win; + if(_data_layout == DataLayout::NHWC) + { + const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); + unsigned int num_rows = 1U; + if(dst->tensor_shape()[0] > 16) + { + num_rows = src->data_type() == DataType::F32 ? 2U : 4U; + } + + // Create window and update padding + win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + } + else if(_data_layout == DataLayout::NCHW) + { + _num_elems_processed_per_iteration = 1u; + win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); + } + + ICLKernel::configure_internal(win); std::stringstream kernel_name; CLBuildOptions build_options; if(_data_layout == DataLayout::NHWC) { - _border_size = BorderSize(); - kernel_name << "direct_convolution_nhwc"; - const unsigned int n0 = win_config.second.x().step(); - const unsigned int m0 = win_config.second.y().step(); + const unsigned int n0 = win.x().step(); + const unsigned int m0 = win.y().step(); const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; const unsigned int pad_left = conv_info.pad_left(); @@ -492,47 +309,42 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT } else { - _border_size = BorderSize(src->padding()); - - kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - + kernel_name << "direct_convolution_nchw"; build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); + build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); + build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); + build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); + build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); + build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); + build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); + build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); + build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); + build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); + build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); + build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); - - if(run_optimized_for_bifrost) + if(is_data_type_quantized(data_type)) { - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - kernel_name << "_f32_bifrost"; - } - else - { - build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); - build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - 
const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); - build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - - kernel_name.str("direct_convolution_quantized"); - } + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DIS_QUANTIZED"); + build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); + build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); } } @@ -565,11 +377,9 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT } Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target) + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); - return Status{}; } @@ -623,22 +433,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl } else { - Window win_in = window; - - win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); - win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); - - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - const int conv_stride_x = std::get<0>(_conv_info.stride()); - const int conv_stride_y = std::get<1>(_conv_info.stride()); - - win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); - win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); - - Window slice_in = win_in.first_slice_window_3D(); - unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); add_3D_tensor_argument(idx1, weights, slice); if(biases != nullptr) @@ -653,11 +448,11 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl do { unsigned int idx = 0; - add_3D_tensor_argument(idx, src, 
slice_in); + add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); } - while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); + while(window.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h index 5624f3a0a7..5681927816 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -72,15 +72,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target); + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; public: DataLayout _data_layout{}; - BorderSize _border_size{}; PadStrideInfo _conv_info{}; }; } // namespace kernels diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp index d2e4049a09..53de6fc403 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.cpp +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -83,7 +83,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target())); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo())); if(act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); -- cgit v1.2.1
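
Note on the padding removal: the body of the new nchw/direct_convolution.cl is not part of this hunk, so the following standalone C++ sketch only illustrates the general pattern the commit title implies — with SRC_WIDTH/SRC_HEIGHT, PAD_LEFT/PAD_TOP, STRIDE_X/STRIDE_Y and WEI_WIDTH/WEI_HEIGHT passed as compile-time constants (mirroring the -D options added in configure()), out-of-bounds taps can be guarded explicitly instead of reading from a physically padded/bordered tensor. Channels, bias and data layout are omitted for brevity, and symmetric padding is assumed; this is not the actual kernel.

    // Minimal sketch: direct convolution with an explicit bounds check instead of tensor padding.
    #include <cstdio>
    #include <vector>

    // Hypothetical compile-time geometry, named after the -D defines added in configure().
    constexpr int SRC_WIDTH = 5, SRC_HEIGHT = 5;
    constexpr int WEI_WIDTH = 3, WEI_HEIGHT = 3;
    constexpr int PAD_LEFT = 1, PAD_TOP = 1;
    constexpr int STRIDE_X = 1, STRIDE_Y = 1;
    constexpr int DST_WIDTH  = (SRC_WIDTH + 2 * PAD_LEFT - WEI_WIDTH) / STRIDE_X + 1;
    constexpr int DST_HEIGHT = (SRC_HEIGHT + 2 * PAD_TOP - WEI_HEIGHT) / STRIDE_Y + 1;

    int main()
    {
        std::vector<float> src(SRC_WIDTH * SRC_HEIGHT, 1.0f);
        std::vector<float> wei(WEI_WIDTH * WEI_HEIGHT, 1.0f);
        std::vector<float> dst(DST_WIDTH * DST_HEIGHT, 0.0f);

        for(int oy = 0; oy < DST_HEIGHT; ++oy)
        {
            for(int ox = 0; ox < DST_WIDTH; ++ox)
            {
                float acc = 0.0f;
                for(int ky = 0; ky < WEI_HEIGHT; ++ky)
                {
                    for(int kx = 0; kx < WEI_WIDTH; ++kx)
                    {
                        const int ix = ox * STRIDE_X - PAD_LEFT + kx;
                        const int iy = oy * STRIDE_Y - PAD_TOP + ky;
                        // Out-of-bounds taps contribute zero; no padded border is ever read.
                        if(ix >= 0 && ix < SRC_WIDTH && iy >= 0 && iy < SRC_HEIGHT)
                        {
                            acc += src[iy * SRC_WIDTH + ix] * wei[ky * WEI_WIDTH + kx];
                        }
                    }
                }
                dst[oy * DST_WIDTH + ox] = acc;
            }
        }
        // Centre output of an all-ones 3x3 convolution over all-ones input is 9.
        std::printf("centre output = %f\n", dst[(DST_HEIGHT / 2) * DST_WIDTH + DST_WIDTH / 2]);
        return 0;
    }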
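
Note on the quantized path: the merged NCHW kernel is driven by the -DOUTPUT_MULTIPLIER, -DOUTPUT_SHIFT and -DOUTPUT_OFFSET options that configure() derives from iqinfo.scale * wqinfo.scale / oqinfo.scale via quantization::calculate_quantized_multiplier(). The standalone C++ sketch below only illustrates that fixed-point requantization scheme — splitting the float rescale factor into a Q0.31 multiplier plus shift and applying it to an int32 accumulator; the scale/offset values are hypothetical and this is not the Compute Library helper itself.

    // Minimal sketch of multiplier/shift requantization for an 8-bit output.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Split 'scale' into a Q0.31 multiplier and a right-shift so that scale ~= m * 2^-31 * 2^-shift.
    static void quantize_multiplier(double scale, int32_t *multiplier, int *shift)
    {
        if(scale == 0.0) { *multiplier = 0; *shift = 0; return; }
        const double q = std::frexp(scale, shift);           // scale = q * 2^exp, q in [0.5, 1)
        int64_t m = std::llround(q * (1ll << 31));
        if(m == (1ll << 31)) { m /= 2; ++(*shift); }
        *shift      = -*shift;                                // store as a right-shift amount
        *multiplier = static_cast<int32_t>(m);
    }

    // Apply multiplier/shift to an int32 accumulator with round-to-nearest, then add the output offset.
    static int32_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t out_offset)
    {
        const int64_t prod    = static_cast<int64_t>(acc) * multiplier;   // Q31 product
        const int64_t rounded = (prod + (1ll << 30)) >> 31;               // back to the integer domain
        const int64_t shifted = (shift > 0) ? ((rounded + (1ll << (shift - 1))) >> shift)
                                            : (rounded << -shift);
        return static_cast<int32_t>(shifted) + out_offset;
    }

    int main()
    {
        // Hypothetical QASYMM8 parameters: input/weights/output scales and output offset.
        const double  in_scale = 0.5, w_scale = 0.02, out_scale = 0.25;
        const int32_t out_offset = 10;

        int32_t mult = 0;
        int     shift = 0;
        quantize_multiplier(in_scale * w_scale / out_scale, &mult, &shift);

        const int32_t acc = 1234;                             // raw convolution accumulator
        const int     q   = std::clamp(static_cast<int>(requantize(acc, mult, shift, out_offset)), 0, 255);
        std::printf("multiplier=%d shift=%d quantized=%d\n", mult, shift, q);
        return 0;
    }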