From 30124354c6848c49f9740d1944d2445782255a85 Mon Sep 17 00:00:00 2001 From: Adnan AlSinan Date: Thu, 2 Dec 2021 19:12:20 +0000 Subject: Remove padding from ClDirectConv2dKernel - Delete old NCHW ClDirectConv2d kernels. - Merge all kernels on a single file. - Removed padding from ClDirectConv2dKernel Resolves COMPMID-4721 Signed-off-by: Adnan AlSinan Change-Id: I624d218fb770e7b5f3c0acd4e85a21ae48470f55 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6779 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Gian Marco Iodice --- src/core/CL/cl_kernels/nchw/direct_convolution.cl | 147 ++++++++++ .../CL/cl_kernels/nchw/direct_convolution1x1.cl | 316 --------------------- .../CL/cl_kernels/nchw/direct_convolution3x3.cl | 291 ------------------- .../CL/cl_kernels/nchw/direct_convolution5x5.cl | 313 -------------------- .../nchw/direct_convolution_quantized.cl | 308 -------------------- 5 files changed, 147 insertions(+), 1228 deletions(-) create mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl delete mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution.cl b/src/core/CL/cl_kernels/nchw/direct_convolution.cl new file mode 100644 index 0000000000..866f62da95 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution.cl @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "helpers_asymm.h" + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 + * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 + * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH + * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time. + * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. 
-DOUTPUT_MULTIPLIER=1234 + * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 + * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3 + * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr (read as 32-bit int when -DIS_QUANTIZED is defined) + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension (in bytes) + */ +__kernel void direct_convolution_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + const int id0 = get_global_id(0); + const int id1 = get_global_id(1); + const int id2 = get_global_id(2); + + const int x_coords = (id0 * STRIDE_X) - PAD_LEFT; /* leftmost input column of the filter window; negative inside the left padding */ + const int y_coords = (id1 * STRIDE_Y) - PAD_TOP; /* topmost input row of the filter window; negative inside the top padding */ + + const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + + __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes); + __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * 
weights_stride_w); + __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z; + +#ifdef IS_QUANTIZED + int acc_value = 0; +#else /* IS_QUANTIZED */ + DATA_TYPE acc_value = 0; +#endif /* IS_QUANTIZED */ + for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) + { + for(int y = 0; y < WEI_HEIGHT; ++y) + { + for(int x = 0; x < WEI_WIDTH; ++x) + { + const int idx_x = (x_coords + x); + const int idx_y = (y_coords + y); + if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT)) /* taps outside the source plane are skipped, so padding contributes nothing */ + { + const int weight_offset = x + (WEI_WIDTH * y); /* each filter row holds WEI_WIDTH weights, so the row pitch is the width, not the height */ + const int input_offset = idx_x + SRC_WIDTH * idx_y; /* element index; source rows are addressed as SRC_WIDTH densely packed elements */ +#ifdef IS_QUANTIZED + int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset)); + int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset)); + acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET); +#else /* IS_QUANTIZED */ + DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset); + DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset); + acc_value += input * weight; +#endif /* IS_QUANTIZED */ + } + } + } + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#ifdef IS_QUANTIZED + int bias = *((__global int *)(vector_offset(&biases, id2))); +#else /* IS_QUANTIZED */ + DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2))); +#endif /* IS_QUANTIZED */ + acc_value += bias; + +#endif /* defined(HAS_BIAS) */ + +#ifdef IS_QUANTIZED + +#if OUTPUT_SHIFT < 0 + acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1); +#else // OUTPUT_SHIFT < 0 + acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1); +#endif // OUTPUT_SHIFT < 0 + acc_value = acc_value + OUTPUT_OFFSET; +#endif /* IS_QUANTIZED */ + + *(__global DATA_TYPE 
*)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE); +} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl deleted file mode 100644 index 8ab2d1d4ea..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" - -#undef CONVERT_SAT - -#define ADD_OP(a, b) ((a) + (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define CONVERT_SAT(a, b) ((a)) - -#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 3 -#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size -#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size) -#elif STRIDE_X == 2 -#define INPUT_PIXEL(data_size) extract_input_stride2 -#elif STRIDE_X == 1 -#define INPUT_PIXEL(data_size) extract_input_stride1 -#else /* STRIDE_X not equals 1, 2 or 3 */ -#error "Only support strides 1, 2 and 3" -#endif /* STRIDE_X == 3 */ - -/** Extracts a 1D horizontal vector from the input tensor with stride as 1. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel) -{ - return vload8(0, input_pixel); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 2. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp = vload16(0, input_pixel); - return temp.s02468ace; -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. 
- */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 4) - temp1 = vload4(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp2 = vload4(0, input_pixel + 6); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp3 = vload4(0, input_pixel + 12); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp4 = vload4(0, input_pixel + 18); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp1 = vload8(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp2 = vload8(0, input_pixel + 8); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp3 = vload8(0, input_pixel + 16); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp1 = vload16(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 16) - temp2 = vload16(0, input_pixel + 12); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); -} - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 - * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. 
-DSTRIDE_X=1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution1x1( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); -#endif /* defined(HAS_BIAS) */ - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8) - values = 0; - - const uint z_index = get_global_id(2); - - weights.ptr += z_index * weights_stride_w; - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr; - VEC_DATA_TYPE(DATA_TYPE, 8) - 
input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr); - values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel)); - src.ptr += src_stride_z; - weights.ptr += weights_stride_z; - } - -#ifdef HAS_BIAS - values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index)))); -#endif /* defined(HAS_BIAS) */ - - vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \ - ({ \ - acc.s0 = mad(src.s0, weight_value, acc.s0); \ - acc.s1 = mad(src.s1, weight_value, acc.s1); \ - acc.s2 = mad(src.s2, weight_value, acc.s2); \ - acc.s3 = mad(src.s3, weight_value, acc.s3); \ - }) - -/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases, -DHAS_BIAS must to be passed at compile - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution1x1_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 acc0 = 0.0f; - float4 acc1 = 0.0f; - float4 acc2 = 0.0f; - float4 acc3 = 0.0f; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights - 
float weight = *((__global float *)weights_addr); - - // Load values from row0 of input tensor - float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); - float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); - float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); - - CONVOLUTION1x1_BIFROST(acc0, src0, weight); - CONVOLUTION1x1_BIFROST(acc1, src1, weight); - CONVOLUTION1x1_BIFROST(acc2, src2, weight); - CONVOLUTION1x1_BIFROST(acc3, src3, weight); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); - - acc0.s0 += bias; - acc0.s1 += bias; - acc0.s2 += bias; - acc0.s3 += bias; - acc1.s0 += bias; - acc1.s1 += bias; - acc1.s2 += bias; - acc1.s3 += bias; - acc2.s0 += bias; - acc2.s1 += bias; - acc2.s2 += bias; - acc2.s3 += bias; - acc3.s0 += bias; - acc3.s1 += bias; - acc3.s2 += bias; - acc3.s3 += bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); - vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl deleted file mode 100644 index 811df053c4..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" - -#undef CONVERT_SAT - -#define ADD_OP(a, b) ((a) + (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define CONVERT_SAT(a, b) ((a)) - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 1 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X == 2 */ - -#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - weights_values0 = vload3(0, weights_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 8) \ - src0 = vload8(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - src1 = vload2(0, src_row_ptr + 8); \ - \ - acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ - }) - -#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - weights_values0 = vload3(0, weights_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 16) \ - src0 = vload16(0, src_row_ptr); \ - DATA_TYPE src1 = *(src_row_ptr + 16); \ - \ - acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ - }) - -/** This kernel performs a 
direct convolution to convolve the low three dimensions. - * - * @note This OpenCL kernel works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution3x3( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8) - values0 = 0; - - __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - CONVOLUTION1x3(values0, 
(__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)))); -#endif /* defined(HAS_BIAS) */ - - vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \ - ({ \ - acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \ - acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \ - acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \ - acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \ - acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \ - acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \ - acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \ - acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \ - acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \ - acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \ - acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \ - acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \ - }) - -/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases, -DHAS_BIAS must to be passed at compile - * - * 
@param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution3x3_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 values0 = 0; - float4 values1 = 0; - float4 values2 = 0; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor - 
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights - float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y)); - float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y)); - float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y)); - float4 src0; - float2 src1; - - // Load values from row0 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4); - - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0); - - // Load values from row1 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1); - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0); - - // Load values from row2 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2); - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1); - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0); - - // Load values from row3 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2); - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1); - - // Row4 - src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - 
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); - - values0 += (float4)bias; - values1 += (float4)bias; - values2 += (float4)bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl deleted file mode 100644 index 59d668f0bf..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" - -#undef CONVERT_SAT - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 1 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X == 2 */ - -#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - weights_values0 = vload4(0, weights_row_ptr); \ - DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ - VEC_DATA_TYPE(DATA_TYPE, 8) \ - src0 = vload8(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - src1 = vload4(0, src_row_ptr + 8); \ - \ - acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ - }) - -#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - weights_values0 = vload4(0, weights_row_ptr); \ - DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ - VEC_DATA_TYPE(DATA_TYPE, 16) \ - src0 = vload16(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - src1 = vload4(0, src_row_ptr + 16); \ - acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; 
\ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ - \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ - }) - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution5x5( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE, 8) - values0 = 0; - - __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - values0 += 
(VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))); -#endif /* defined(HAS_BIAS) */ - - vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \ - ({ \ - acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \ - acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \ - acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \ - acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \ - acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \ - acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \ - acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \ - acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \ - acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \ - acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \ - acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \ - acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \ - acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \ - acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \ - acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \ - acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \ - acc.s0 = mad(src0.s4, weights_row01, acc.s0); \ - acc.s1 = mad(src0.s5, weights_row01, acc.s1); \ - acc.s2 = mad(src0.s6, weights_row01, acc.s2); \ - acc.s3 = mad(src0.s7, weights_row01, acc.s3); \ - }) - -/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution5x5_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 values0 = 0.0f; - float4 values1 = 0.0f; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor - - for(ushort d = 
0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights from row0 and row1 - float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y)); - float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4); - float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y)); - float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4); - float8 src0; - - // Load values from row0 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - - // Load values from row1 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - // Load values from row2 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y)); - - // Load weights from row2 - weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y)); - weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); - - // Load values from row3 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y)); - - // Load weights from row3 - weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y)); - weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - // Load values from row4 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y)); - - // 
Load weights from row4 - weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y)); - weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4); - - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); - - // Load values from row5 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index))); - - values0 += bias; - values1 += bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl deleted file mode 100644 index b80d4f587e..0000000000 --- a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" - -#undef CONVERT_SAT_STR -#undef CONVERT_SAT - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) - -#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -#if KERNEL_SIZE == 9 - -#if STRIDE_X == 1 -#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + 
WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 5 - -#if STRIDE_X == 1 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) 
CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ - int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ - int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \ - acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 3 - -#if STRIDE_X == 1 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) 
CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ - int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ - int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \ - acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int src1 = convert_int(*(src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 1 - -#if STRIDE_X == 3 -#define INPUT_VALUE extract_input_stride3 -#elif STRIDE_X == 2 -#define INPUT_VALUE extract_input_stride2 -#elif STRIDE_X == 1 -#define INPUT_VALUE extract_input_stride1 - -#else /* STRIDE_X not equals 1, 2 or 3 */ -#error "Only support strides 1, 2 and 3" -#endif /* STRIDE_X */ - -/** Extracts a 1D horizontal vector from the input tensor with stride as 1. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. 
- */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value) -{ - return vload8(0, input_value); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 2. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp = vload16(0, input_value); - return temp.s02468ace; -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp1 = vload16(0, input_value); - VEC_DATA_TYPE(DATA_TYPE, 16) - temp2 = vload16(0, input_value + 12); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); -} - -#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */ -#error "Only kernel sizes 1, 3, 5 and 9 are supported" -#endif /* KERNEL_SIZE */ - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 - * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 - * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. 
-DINPUT_OFFSET=3 - * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3 - * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32 - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution_quantized( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - int8 values0 = 0; - - __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0); - __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { -#if KERNEL_SIZE == 9 - 
CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y)); - CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y)); -#elif KERNEL_SIZE == 5 - CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); -#elif KERNEL_SIZE == 3 - CONVOLUTION1x3(values0, 
(__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); -#elif KERNEL_SIZE == 1 - int weight = convert_int(*(__global DATA_TYPE *)weights_addr); - int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr)); - values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET); -#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */ - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index))); - values0 += (int8)(*bias_addr); -#endif /* defined(HAS_BIAS) */ - -#if OUTPUT_SHIFT < 0 - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); -#else // OUTPUT_SHIFT < 0 - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); -#endif // OUTPUT_SHIFT < 0 - values0 = values0 + OUTPUT_OFFSET; - - vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) -- cgit v1.2.1