diff options
Diffstat (limited to 'src/core/CL/cl_kernels/nhwc')
24 files changed, 6701 insertions, 0 deletions
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(DATA_TYPE) && defined(BATCH_SIZE)
/** Batch to space transformation. (NHWC)
 *
 * Scatters one element of one input batch to its destination (x, y, batch)
 * position in the output, using a block shape read from a tensor at runtime.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
 *
 * @param[in]  input_ptr                                  Pointer to the source tensor. Supported data types: All
 * @param[in]  input_stride_x                             Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                               input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                             Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                               input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                             Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                               input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes        The offset of the first element in the first source tensor
 * @param[in]  batch_id                                   The input tensor batch id
 * @param[in]  block_shape_ptr                            Pointer to the block shape tensor. Supported data types: S32
 * @param[in]  block_shape_stride_x                       Stride of the block shape tensor in X dimension (in bytes)
 * @param[in]  block_shape_step_x                         block_shape_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  block_shape_stride_y                       Stride of the block shape tensor in Y dimension (in bytes)
 * @param[in]  block_shape_step_y                         block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  block_shape_offset_first_element_in_bytes  The offset of the first element in the block shape tensor
 * @param[out] output_ptr                                 Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                            Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                              output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                            Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                              output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                            Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                              output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes       The offset of the first element in the destination tensor
 */
__kernel void batch_to_space_nhwc(
    TENSOR3D_DECLARATION(input),
    const int batch_id,
    VECTOR_DECLARATION(block_shape),
    TENSOR4D_DECLARATION(output))
{
    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);

    // Block shape is provided at runtime as the first two S32 elements of the vector
    const int block_x = *((__global int *)vector_offset(&block, 0));
    const int block_y = *((__global int *)vector_offset(&block, 1));

    // r = number of batches in the output (input batches / block area)
    const int r = (BATCH_SIZE / (block_x * block_y));
    const int x = get_global_id(1);
    const int y = get_global_id(2);
    const int z = get_global_id(0);
    const int w = batch_id % r;

    // Spread the extra batches (batch_id / r) over the spatial block positions
    const int out_x = x * block_x + (batch_id / r) % block_x;
    const int out_y = y * block_y + (batch_id / r) / block_x;

    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
}
#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)

#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
/** Batch to space transformation. (NHWC)
 *
 * Same as @ref batch_to_space_nhwc but with the block shape known at compile time.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
 * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
 * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
 * @param[in]  batch_id                             The input tensor batch id
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void batch_to_space_static_nhwc(
    TENSOR3D_DECLARATION(input),
    const int batch_id,
    TENSOR4D_DECLARATION(output))
{
    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);

    const int block_x = BLOCK_SHAPE_X;
    const int block_y = BLOCK_SHAPE_Y;

    // r = number of batches in the output (input batches / block area)
    const int r = (BATCH_SIZE / (block_x * block_y));
    const int x = get_global_id(1);
    const int y = get_global_id(2);
    const int z = get_global_id(0);
    const int w = batch_id % r;

    // Spread the extra batches (batch_id / r) over the spatial block positions
    const int out_x = x * block_x + (batch_id / r) % block_x;
    const int out_y = y * block_y + (batch_id / r) / block_x;

    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
}
#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#define ADD_OP(a, b) ((a) + (b))
#define SUB_OP(a, b) ((a) - (b))
#define MUL_OP(a, b) ((a) * (b))
#define INVSQRT_OP(a) rsqrt((a))
#define SQCVT_SAT(a) (a)

#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
#include "activation_float_helpers.h"

/** Apply batch normalization on tensors with NHWC format.
 *
 * Computes out = gamma * (x - mean) * rsqrt(var + epsilon) + beta, followed by
 * an optional activation, on VEC_SIZE channels per work item.
 *
 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
 *
 * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
 * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
 * @param[out] output_ptr                           (Only if not IN_PLACE) Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
 * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
 * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
 * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
 * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
 * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
 * @param[in]  beta_ptr                             (Only if not USE_DEFAULT_BETA) Pointer to the beta source tensor. Supported data types: same as @p input_ptr
 * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
 * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
 * @param[in]  gamma_ptr                            (Only if not USE_DEFAULT_GAMMA) Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
 * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
 * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
 * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
 */
__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
#ifndef IN_PLACE
                                            TENSOR3D_DECLARATION(output),
#endif /* not IN_PLACE */
                                            VECTOR_DECLARATION(mean),
                                            VECTOR_DECLARATION(var),
#ifndef USE_DEFAULT_BETA
                                            VECTOR_DECLARATION(beta),
#endif /* USE_DEFAULT_BETA */
#ifndef USE_DEFAULT_GAMMA
                                            VECTOR_DECLARATION(gamma),
#endif /* USE_DEFAULT_GAMMA */
                                            float epsilon)
{
    // Shift the X offset back for the last (partial) vector so all accesses stay in bounds
    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);

    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
    // In-place computation: write back to the exact location that was read.
    // Fix: the previous code used the unoffset input_ptr here, which would make
    // every work item store its result at the start of the buffer (ignoring
    // input_offset_first_element_in_bytes, x_offs and the y/z strides).
    __global uchar *output_addr = input_addr;
#else  /* IN_PLACE */
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */
    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
    __global uchar *var_addr  = var_ptr + var_offset_first_element_in_bytes + x_offs;
#ifndef USE_DEFAULT_BETA
    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
#endif /* USE_DEFAULT_BETA */
#ifndef USE_DEFAULT_GAMMA
    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
#endif /* USE_DEFAULT_GAMMA */

    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    data = 0;
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    denominator = 0;
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    numerator = 0;
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    x_bar = 0;
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    res0 = 0;

    // denominator = 1 / sqrt(var + epsilon)
    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));

    // Calculate x bar = (x - mean) / sqrt(var + epsilon) and store results
    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
    numerator = SUB_OP(data, numerator);
    x_bar     = MUL_OP(numerator, denominator);

#ifndef USE_DEFAULT_GAMMA
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);

    res0 = MUL_OP(gamma_vec, x_bar);
#else  /* USE_DEFAULT_GAMMA */
    // gamma is equal to 1, no need to perform multiplications
    res0 = x_bar;
#endif /* USE_DEFAULT_GAMMA */

#ifndef USE_DEFAULT_BETA
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
    // beta is not zero, hence we need to perform the addition
    res0 = ADD_OP(res0, beta_vec);
#endif /* USE_DEFAULT_BETA */

    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);

    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "tile_helpers.h"

#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)

// Check valid VEC_SIZES
#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16

// Compute x / y (div_res) and x % y (mod_res) with a single multiply,
// exploiting that y is a compile-time constant.
#define DIV_MOD_UINT(x, y, div_res, mod_res) \
    ({ \
        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
        uint r = div_res * (y); \
        mod_res = (x)-r; \
    })

#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)

/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
 *
 * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
 * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
 * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
 * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
 * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
 *       K is equal to num_channels / num_groups.
 * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER,
 *       i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
                                   TENSOR4D_DECLARATION(dst))
{
    // Offset computation
    const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map

    uint z        = 0;
    uint batch_id = 0;
    // Compute curr_channel and batch_id
    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);

    VEC_DATA_TYPE(uint, VEC_SIZE)
    curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);

    // Transposed-group gather: input channel for each output channel
    VEC_DATA_TYPE(uint, VEC_SIZE)
    in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));

    // Load the values
    const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);

#if VEC_SIZE == 1
    // Fix: `(const __global * DATA_TYPE)` is not valid OpenCL C (the `*` preceded
    // the type name). input_ptr already has type const __global DATA_TYPE *, so
    // index it directly, matching the other VEC_SIZE branches.
    DATA_TYPE out0 = *(input_ptr + in_channels);
#elif VEC_SIZE == 2
    VEC_DATA_TYPE(DATA_TYPE, 2)
    out0 =
    {
        *(input_ptr + in_channels.s0),
        *(input_ptr + in_channels.s1)
    };
#elif VEC_SIZE == 3
    VEC_DATA_TYPE(DATA_TYPE, 3)
    out0 =
    {
        *(input_ptr + in_channels.s0),
        *(input_ptr + in_channels.s1),
        *(input_ptr + in_channels.s2)
    };
#elif VEC_SIZE == 4
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out0 =
    {
        *(input_ptr + in_channels.s0),
        *(input_ptr + in_channels.s1),
        *(input_ptr + in_channels.s2),
        *(input_ptr + in_channels.s3)
    };
#elif VEC_SIZE == 8
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0 =
    {
        *(input_ptr + in_channels.s0),
        *(input_ptr + in_channels.s1),
        *(input_ptr + in_channels.s2),
        *(input_ptr + in_channels.s3),
        *(input_ptr + in_channels.s4),
        *(input_ptr + in_channels.s5),
        *(input_ptr + in_channels.s6),
        *(input_ptr + in_channels.s7)
    };
#elif VEC_SIZE == 16
    VEC_DATA_TYPE(DATA_TYPE, 16)
    out0 =
    {
        *(input_ptr + in_channels.s0),
        *(input_ptr + in_channels.s1),
        *(input_ptr + in_channels.s2),
        *(input_ptr + in_channels.s3),
        *(input_ptr + in_channels.s4),
        *(input_ptr + in_channels.s5),
        *(input_ptr + in_channels.s6),
        *(input_ptr + in_channels.s7),
        *(input_ptr + in_channels.s8),
        *(input_ptr + in_channels.s9),
        *(input_ptr + in_channels.sa),
        *(input_ptr + in_channels.sb),
        *(input_ptr + in_channels.sc),
        *(input_ptr + in_channels.sd),
        *(input_ptr + in_channels.se),
        *(input_ptr + in_channels.sf)
    };
#endif // VEC_SIZE == 1

    __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
    STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
/*
 * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
/** Depth to space transformation. (NHWC)
 *
 * Each work item moves one element from its source channel to the
 * corresponding (channel, x, y) position of the block-expanded output.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
 * @param[in]  batch_id                             The input tensor batch id
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depth_to_space_nhwc(
    TENSOR3D_DECLARATION(input),
    const int batch_id,
    TENSOR4D_DECLARATION(output))
{
    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);

    // Number of channels in the output (input channels / block area)
    const int channels_out = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));

    // Split the source channel index into destination channel and block position
    const int gid0      = get_global_id(0);
    const int out_c     = gid0 % channels_out;
    const int block_idx = gid0 / channels_out;

    // Expand the spatial coordinates by the block shape
    const int out_x = get_global_id(1) * BLOCK_SHAPE + block_idx % BLOCK_SHAPE;
    const int out_y = get_global_id(2) * BLOCK_SHAPE + block_idx / BLOCK_SHAPE;

    *((__global DATA_TYPE *)tensor4D_offset(&out, out_c, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
}
#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
 *
 * Each element is dequantized as out = (float)in * scale[channel], where the
 * channel index is the X coordinate in the NHWC layout.
 *
 * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
 * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
 */
__kernel void dequantization_layer_per_channel_nhwc(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output),
    __global float *scale)
{
    // Get pixels pointer
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

#if defined(LAST_ACCESSED_X)
    // Check if access on width gets out of bounds
    // If it does shift access vector to access elements within bounds
    // (the input/output pointers and the scale pointer must all be shifted
    //  back by the same amount so elements stay paired with their scales)
    const int xi = (int)(get_global_id(0) * VEC_SIZE);
    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
    scale -= max(xi - (int)LAST_ACCESSED_X, 0);

    // Load data (widened to int before the float conversion)
    VEC_DATA_TYPE(int, VEC_SIZE)
    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));

    // Create scale vectors (scale was already shifted back, so index by xi)
    const VEC_DATA_TYPE(float, VEC_SIZE)
    vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);

    // Dequantize: out = in * scale[channel]
    VEC_DATA_TYPE(float, VEC_SIZE)
    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));

    // Store result
    VSTORE(VEC_SIZE)
    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
#else  // !defined(LAST_ACCESSED_X)
    // Scalar fallback: one element per work item
    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
#endif // defined(LAST_ACCESSED_X)
}
#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl new file mode 100644 index 0000000000..75a7a0f004 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "activation_float_helpers.h" +#include "helpers.h" +#include "helpers_asymm.h" +#include "tile_helpers.h" + +//! @cond Doxygen_Suppress +/** OpenCL kernel to compute the direct convolution. + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED + * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. 
-DDATA_TYPE_PROMOTED=half) + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2) + * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) + * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64) + * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDDST_CHANNELS=64) + * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) + * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) + * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float) + * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) + * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. 
-DACC_DATA_TYPE=float) + * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1) + * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, .... n + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) + * + *@note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time: + * - -DIS_QUANTIZED + * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234 + * - The destination quantization shift e.g. -DDST_SHIFT=4 + * - The destination offset e.g. -DDST_OFFSET=4 + * - The source offset e.g. -DSRC_OFFSET=4 + * - The weights offset e.g. -DWEI_OFFSET=4 + * - The quantized zero value e.g. -DZERO_VALUE=4 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: F16/F32/QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_ptr Pointer to the weights tensor. 
Supported data type: same as @p src_ptr + * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) + * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix + * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED) + * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + */ +//! @endcond +__kernel void direct_convolution_nhwc( + TENSOR4D(src, SRC_TENSOR_TYPE), + TENSOR4D(dst, DST_TENSOR_TYPE), + TENSOR4D(wei, WEI_TENSOR_TYPE) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bia) +#endif // defined(HAS_BIAS) +) +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
+#define _IWEI_WIDTH WEI_WIDTH +#define _IWEI_HEIGHT WEI_HEIGHT +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _ISRC_CHANNELS SRC_CHANNELS +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS +#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) + + // If quantized, the output tile has to be quantized first before being stored to global memory +#if defined(IS_QUANTIZED) +#define _IOUTPUT_TILE cq +#else // defined(IS_QUANTIZED) +#define _IOUTPUT_TILE c +#endif // defined(IS_QUANTIZED) + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // .v = access the whole vector (OpenCL vector) + // .s[x] = access the vector element at position x (scalar access) + TILE(int, M0, 1, xi); + TILE(int, M0, 1, yi); + + // Convert the linear index to coordinate + LOOP_UNROLLING(int, i, 0, 1, M0, + { + xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X; + yi[i].v = ((mout + i) / _IDST_WIDTH) * STRIDE_Y; + xi[i].v -= PAD_LEFT; + yi[i].v -= PAD_TOP; + }) + + // Initialize the accumulators + TILE(ACC_DATA_TYPE, M0, N0, c); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) + + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) + { + int ck = 0; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_WIDTH; + + int k = 0; + for(; k <= (_ISRC_CHANNELS - K0); k += K0) + { + TILE(SRC_DATA_TYPE, M0, K0, a); + TILE(WEI_DATA_TYPE, N0, K0, b); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = ZERO_VALUE; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + + // Load tile from the weights tensor + T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); + + // Compute the matrix multiplication 
between two tiles + T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); + + // Apply the offset correction (correction usually needed for asymmetric quantized computation) + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); + + ck += K0; + } + + // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS + // This #if directive should be removed in case of dynamic tensor support +#if((SRC_CHANNELS % K0) != 0) + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) + { + TILE(SRC_DATA_TYPE, M0, 1, a); + TILE(WEI_DATA_TYPE, N0, 1, b); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = ZERO_VALUE; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); + + // Apply the offset correction (operation usually needed for asymmetric quantized computation) + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); + + ++ck; + } +#endif // ((SRC_CHANNELS % K0) != 0) + } + + // Offset correction required for the quantized asymmetric computation + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c); + +#if defined(HAS_BIAS) + TILE(BIA_DATA_TYPE, 1, N0, bias0); + + 
T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + +#endif // HAS_BIAS + + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + +#if defined(IS_QUANTIZED) + + TILE(DST_DATA_TYPE, M0, N0, cq); + + // Quantize the tile + T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); +#endif // defined(IS_QUANTIZED) + + // Apply activation + T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE); + + // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8 + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y); + +#undef _IWEI_WIDTH +#undef _IWEI_HEIGHT +#undef _ISRC_WIDTH +#undef _ISRC_HEIGHT +#undef _ISRC_CHANNELS +#undef _IDST_WIDTH +#undef _IDST_HEIGHT +#undef _IDST_CHANNELS +#undef _IY_MULTIPLIER +}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl new file mode 100644 index 0000000000..d2e7e45ada --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "activation_float_helpers.h" +#include "helpers.h" +#include "helpers_asymm.h" +#include "tile_helpers.h" + +#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) +//! 
@cond Doxygen_Suppress +/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16) + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DDATA_TYPE_PROMOTED=half) + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2) + * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2) + * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) + * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64) + * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDDST_CHANNELS=64) + * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. 
-DDST_TENSOR_TYPE=BUFFER) + * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) + * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float) + * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) + * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float) + * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1) + * @note Only the following configurations of M0 and N0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only) + * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) + * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_ptr Pointer to the weights tensor. 
Supported data type: same as @p src_ptr + * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) + * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix + * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED) + * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + */ +//! @endcond +__kernel void dwc_native_fp_nhwc( + TENSOR4D(src, SRC_TENSOR_TYPE), + TENSOR4D(dst, DST_TENSOR_TYPE), + TENSOR4D(wei, WEI_TENSOR_TYPE) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bia) +#endif // defined(HAS_BIAS) +) +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
+#define _IWEI_WIDTH WEI_WIDTH +#define _IWEI_HEIGHT WEI_HEIGHT +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS +#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension) +#define _IN0_A N0 // Cols tile A +#define _IM0_B _IWEI_WIDTH // Rows tile B +#define _IN0_B N0 // Cols tile B +#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1))) + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH +#if defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT + const int bout = 0; // BATCH SIZE IDX +#endif // defined(BATCHED_EXECUTION) + + int xi = xo * STRIDE_X; + int yi = yo * STRIDE_Y; + xi -= PAD_LEFT; + yi -= PAD_TOP; + + int d = 0; +#if DEPTH_MULTIPLIER != 1 + for(; d < DEPTH_MULTIPLIER; d++) +#endif // DEPTH_MULTIPLIER != 1 + { + TILE(ACC_DATA_TYPE, M0, N0, c); + + // Reset accumulators + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) + +#if _IWEI_HEIGHT <= 5 + LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 + { + TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); + + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = 0; + }) + + // Load tile from the src tensor (TILE A) + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); + + TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); + + // Load tile from the weights tensor (TILE B) + T_LOAD(WEI_DATA_TYPE, _IM0_B, 
_IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b); + + // Optimized path for STRIDE_X == 1 + // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + c[m0].v += a[xk + m0].v *b[xk].v; + }) + }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 + +#if defined(HAS_BIAS) + TILE(BIA_DATA_TYPE, 1, N0, bias0); + + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); +#endif // HAS_BIAS + + T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c); + + TILE(uint, M0, 1, dst_indirect_y); + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + if(x_cond) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE_PARTIAL(N0, PARTIAL_N0) + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE(N0) + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + } +} +#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl new file mode 100644 index 0000000000..1bc58b6e26 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "helpers.h" +#include "tile_helpers.h" + +#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) +#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE +#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0) +#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_uchar (0) +#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_char (128) +#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_uchar (-128) + +#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \ + ({}) + +#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \ + TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \ + TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \ + T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, multipliers); \ + T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, shifts); + +#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) +#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE() + +#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) +//! @cond Doxygen_Suppress +/** OpenCL kernel to compute the depthwise convolution for quantized data types + * + * @note Data layout supported: NHWC + * @note Data type supported: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. 
-DPAD_LEFT=2, -DPAD_TOP=2) + * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2) + * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2) + * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) + * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64) + * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDDST_CHANNELS=64) + * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) + * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=int8) + * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=int8) + * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=int8) + * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. 
-DACC_DATA_TYPE=int) + * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1) + * @note The activation type must be passed at compile time using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu + * @note The A and B variables required by some activation functions must be passed at compile time using -DA_VAL= and -DB_VAL= respectively + * @note The quantization offset used for both the per-tensor and per-channel quantization must be passed at compile time using -DDST_OFFSET (e.g., -DDST_OFFSET=3) + * @note The quantization shift for the per-tensor quantization must be passed at compile time using -DDST_SHIFT (e.g., -DDST_SHIFT=1) + * @note The quantization multiplier for the per-tensor quantization must be passed at compile time using -DDST_MULTIPLIER (e.g., -DDST_MULTIPLIER=121432) + * @note Only the following configurations of M0 and N0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only) + * - N0 = 2, 3, 4, 8, 16 + * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) + * + * @param[in] src_ptr Pointer to the source tensor.
Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_ptr Pointer to the weights tensor. 
Supported data type: same as @p src_ptr + * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) + * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32 + * @param[in] dst_multipliers_stride_x Stride of the destination multipliers tensor in X dimension (in bytes) + * @param[in] dst_multipliers_step_x dst_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_multipliers_offset_first_element_in_bytes The offset of the first element in the destination multipliers tensor + * @param[in] dst_shifts_ptr Pointer to the destination shifts tensor for the per-channel quantization. 
Supported data type: S32 + * @param[in] dst_shifts_stride_x Stride of the destination shifts tensor in X dimension (in bytes) + * @param[in] dst_shifts_step_x dst_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_shifts_offset_first_element_in_bytes The offset of the first element in the destination shifts tensor + * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: S32 + * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + */ +//! @endcond +__kernel void dwc_native_quantized_nhwc( + TENSOR4D(src, SRC_TENSOR_TYPE), + TENSOR4D(dst, DST_TENSOR_TYPE), + TENSOR4D(wei, WEI_TENSOR_TYPE), + VECTOR_DECLARATION(dst_multipliers), + VECTOR_DECLARATION(dst_shifts) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bia) +#endif // defined(HAS_BIAS) +) +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
+#define _IWEI_WIDTH WEI_WIDTH +#define _IWEI_HEIGHT WEI_HEIGHT +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS +#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension) +#define _IN0_A N0 // Cols tile A +#define _IM0_B _IWEI_WIDTH // Rows tile B +#define _IN0_B N0 // Cols tile B +#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1))) + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH +#if defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT + const int bout = 0; // BATCH SIZE IDX +#endif // defined(BATCHED_EXECUTION) + + int xi = xo * STRIDE_X; + int yi = yo * STRIDE_Y; + xi -= PAD_LEFT; + yi -= PAD_TOP; + + int d = 0; +#if DEPTH_MULTIPLIER != 1 + for(; d < DEPTH_MULTIPLIER; d++) +#endif // DEPTH_MULTIPLIER != 1 + { + TILE(ACC_DATA_TYPE, M0, N0, c); + + // Reset accumulators + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) + +#if _IWEI_HEIGHT <= 5 + LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 + { + TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); + + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = ZERO_VALUE; + }) + + // Load tile from the src tensor (TILE A) + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); + + TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); + + // Load tile from the weights tensor (TILE B) + T_LOAD(WEI_DATA_TYPE, 
_IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b); + + // Optimized path for STRIDE_X == 1 + // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, n0, 0, 1, N0, + { +#if _IWEI_WIDTH <= 16 +#define DOT_DATA_TYPE SRC_DATA_TYPE +#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE)) + + // Optimized path for the dot instruction + TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0); + TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0); + ACC_DATA_TYPE offset_a = 0; + ACC_DATA_TYPE offset_b = 0; + + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + x0[0].s[xk] = a[xk + m0].s[n0]; + y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION; + }) + DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]); + REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a); + REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b); + c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET; +#else // _IWEI_WIDTH <= 16 + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET)); + }) +#endif // _IWEI_WIDTH <= 16 + }) + }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 + +#if _IWEI_WIDTH <= 16 + T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c); +#endif // _IWEI_WIDTH <= 16 + +#if defined(HAS_BIAS) + TILE(BIA_DATA_TYPE, 1, N0, bias0); + + // Load bias + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0); + + // c = c + 
bias[broadcasted] + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); +#endif // HAS_BIAS + + T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE); + + // Quantize the tile + TILE(DST_DATA_TYPE, M0, N0, cq); + T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq); + + // Perform activation + T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq); + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + if(x_cond) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE_PARTIAL(N0, PARTIAL_N0) + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE(N0) + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + } +} +#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl new file mode 100644 index 0000000000..ac00c11283 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/im2col.cl @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#if defined(DATA_TYPE) && defined(ELEMENT_SIZE) + +#if ELEMENT_SIZE == 1 +#define COND_DATA_TYPE char +#elif ELEMENT_SIZE == 2 +#define COND_DATA_TYPE short +#elif ELEMENT_SIZE == 4 +#define COND_DATA_TYPE int +#else // ELEMENT_SIZE +#error "Element size not support" +#endif // ELEMENT_SIZE + +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) + +#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) +#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE) + +/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension + * @name IM2COL1X9_NHWC_STORE + * + * @note To use this macro for a 3x3 block, @p ROW has to be 0 + * + * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16 + * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size + * @param[in] DATA_TYPE Data type of @p DATA + * @param[in] SRC_DEPTH Input channel size / depth + * @param[in] DATA Value variable base name + * @param[in] ROW The row number to store. 
Supported: 0-8 + * @param[in] OUTPUT_PTR Output pointer + * @{ + */ +#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE +#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + const bool at_channel_boundary = get_global_id(0) == 0; \ + if(at_channel_boundary) \ + { \ + IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + } \ + else \ + { \ + IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + } +#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE +#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) +#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE + +#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + VSTORE(VECTOR_SIZE) \ + (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##8, 
0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); + +#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); +/** @}*/ + +/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col3x3_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int3 offset = 0; + + // Clamp xi + int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT); +#if PAD_LEFT != 0 || PAD_RIGHT != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1)); +#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 + // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor + xi_offset *= (int3)src_stride_y; + + // Out-of-bound condition for X + int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH); + + // yi == 0 + // Clamp yi + // yi_coord is casted to unsigned int in order to use just a min() operation + // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 + // This is a trick so that the values loaded in the padding 
areas are always from the last row (SRC_HEIGHT - 1), + // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around + // also causes y_cond (y padding condition) to be satisfied + yi_coord = yi - (int)PAD_TOP; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT)); + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 1 + // Clamp yi_coord (it can be negative if PAD_TOP > 1) + yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 
offset.s1)); + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with zeros + y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT)); + values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 2 + // Clamp yi_coord + yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT)); + values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // Store in a boundary-aware way to avoid padding + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, 
BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr) + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. + // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f; + } +#endif // HAS_BIAS +} + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 +#define IM2COL1x9(i) \ + ({ \ + yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ + \ + offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ + offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ + \ + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ + \ + int y_cond = (int)((uint)(yi - 
(int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \ + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \ + values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \ + values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \ + values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \ + values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \ + values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \ + values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \ + values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \ + values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \ + \ + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ + }) +#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 +#define IM2COL1x9(i) \ + ({ \ + yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ + \ + offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ + offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ + \ + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ + 
VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ + \ + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ + }) +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + +/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
+ */ +__kernel void im2col9x9_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int8 offset0 = 0; + int offset1 = 0; + + // Clamp xi + int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT); + int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT); + +#if PAD_LEFT != 0 || PAD_RIGHT != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1)); + xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1)); +#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 + xi_offset0 *= (int8)src_stride_y; + xi_offset1 *= (int)src_stride_y; + + // Out-of-bound condition for X + int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH); + int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + IM2COL1x9(0); + IM2COL1x9(1); + IM2COL1x9(2); 
+ IM2COL1x9(3); + IM2COL1x9(4); + IM2COL1x9(5); + IM2COL1x9(6); + IM2COL1x9(7); + IM2COL1x9(8); + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. + // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f; + } +#endif // HAS_BIAS +} + +/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. 
-DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col_generic_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int i = 0; + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + // Clamp yi_coord + int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; + yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); + + // Out-of-bound condition for Y + int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * 
DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); + + for(int xk = 0; xk < KERNEL_WIDTH; ++xk) + { + // Clamp xi_coord + int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); + xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); + + // Out-of-bound condition for X + int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); + + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + // Replace with PAD_VALUE if the value is out-of-bound + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition))); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + + // Store in a boundary-aware way to avoid padding +#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE + const bool at_channel_boundary = get_global_id(0) == 0; + if(at_channel_boundary) + { + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + } + else // at_channel_boundary +#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE + { + VSTORE(VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + } + i++; + } + } + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. 
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; + } +#endif // HAS_BIAS +} +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) +#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl new file mode 100644 index 0000000000..7e35e161c8 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "tile_helpers.h" + +#define MUL_OP(x, y) ((x) * (y)) +#define ADD_OP(x, y) ((x) + (y)) +#define DIV_OP(x, y) ((x) / (y)) +#define POW_OP(x, y) pow((x), (y)) +#define SQCVT_SAT(a) (a) + +#if defined(WIDTH_SIZE) +/** Apply cross-map normalization. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. 
-DVEC_SIZE=16
 * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
 * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
 * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
 *
 * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
 * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor.
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Offset computation + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int left_slice = max((int)0, (int)x_offs - (int)RADIUS); + const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS); + + for(int i = left_slice; i <= right_slice; ++i) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE))); + acc = ADD_OP(acc, MUL_OP(values, values)); + } + + acc = ADD_OP(MUL_OP(acc, 
coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized); + + STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(WIDTH_SIZE) + +#if defined(NUM_SLICES) && defined(DIM1_SIZE) +/** Apply in-map normalization when tensors are in the NHWC data layout format. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * + * @param[in] input_ptr Pointer to the first source tensor. 
Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Offset computation + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + const int current_cols = get_global_id(1); + const int current_rows = get_global_id(2); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); + __global uchar 
*output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int first_col = max(0, current_cols - (int)RADIUS); + const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS); + +#if defined(IN_MAP_2D) + const int first_row = max(0, current_rows - (int)RADIUS); + const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS); +#endif /* defined(IN_MAP_2D) */ + +#if defined(IN_MAP_2D) + for(int j = first_row; j <= last_row; ++j) + { +#else // defined(IN_MAP_2D) + const int j = current_rows; +#endif /* defined(IN_MAP_2D) */ + for(int i = first_col; i <= last_col; ++i) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z)); + acc = ADD_OP(acc, MUL_OP(values, values)); + } +#if defined(IN_MAP_2D) + } +#endif /* defined(IN_MAP_2D) */ + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows *output_stride_z)), normalized); + + STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(NUM_SLICES) && defined(DIM1_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl new file mode 100644 index 0000000000..86c33499e2 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +/** Apply normalize_planar_yuv layer on tensors with NHWC data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. 
-DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p src_ptr
 * @param[in]  mean_stride_x                     Stride of the mean source tensor in X dimension (in bytes)
 * @param[in]  mean_step_x                       mean_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
 * @param[in]  std_ptr                           Pointer to the std tensor. Supported data types: same as @p src_ptr
 * @param[in]  std_stride_x                      Stride of the std tensor in X dimension (in bytes)
 * @param[in]  std_step_x                        std_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  std_offset_first_element_in_bytes The offset of the first element in the var source tensor
 */
__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
                                              TENSOR3D_DECLARATION(dst),
                                              VECTOR_DECLARATION(mean),
                                              VECTOR_DECLARATION(std))
{
    // Byte offset along the channel (X) dimension. The work-item holding the leftover
    // vector is shifted back so that every work-item stores a full, in-bounds VEC_SIZE vector.
    const uint xoff = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);

    // Per-work-item base addresses; mean/std are 1D per-channel vectors
    __global uchar *in_base   = src_ptr + src_offset_first_element_in_bytes + xoff + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *out_base  = dst_ptr + dst_offset_first_element_in_bytes + xoff + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
    __global uchar *mean_base = mean_ptr + mean_offset_first_element_in_bytes + xoff;
    __global uchar *std_base  = std_ptr + std_offset_first_element_in_bytes + xoff;

    // Load the per-channel statistics and the input values
    const TYPE mean_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_base);
    const TYPE std_vec  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_base);
    const TYPE in_vec   = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in_base);

    // Normalize: (x - mean) / std. The name "res0" is mandated by STORE_VECTOR_SELECT,
    // which concatenates the basename "res" with the suffix "0".
    const TYPE res0 = (in_vec - mean_vec) / std_vec;

    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl new file mode 100644 index 0000000000..7bc3c15a63 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define OFFSET_FLT ((float)OFFSET) +#define SCALE_FLT ((float)SCALE) + +/** Apply normalize_planar_yuv layer on tensors with NHWC data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 + * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. 
 Supported data types: same as @p src_ptr
 * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
 * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
 */
__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
                                                 TENSOR3D_DECLARATION(dst),
                                                 VECTOR_DECLARATION(mean),
                                                 VECTOR_DECLARATION(std))
{
    // Shift back by VEC_SIZE_LEFTOVER elements for the first work-item so the leftover
    // (non-multiple-of-VEC_SIZE) channels are processed without reading out of bounds.
    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);

    // NHWC: x covers channels (vectorized), y covers width, z covers height(/batch).
    __global uchar *src_addr  = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr  = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
    __global uchar *std_addr  = std_ptr + std_offset_first_element_in_bytes + x_offs;

    // Dequantize the mean values: (q - OFFSET_FLT) * SCALE_FLT
    VEC_DATA_TYPE(float, VEC_SIZE)
    curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE));
    curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;

    // Dequantize the std values
    VEC_DATA_TYPE(float, VEC_SIZE)
    curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE));
    curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;

    // Dequantize the input values
    VEC_DATA_TYPE(float, VEC_SIZE)
    data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE));
    data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT);

    // Perform normalization in float: (x - mean) / std
    VEC_DATA_TYPE(float, VEC_SIZE)
    res_flt = (data_flt - curr_mean_flt) / curr_std_flt;

    // Requantize the result back to the output quantization (saturating convert)
    const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
    STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl new file mode 100644 index 0000000000..5b59ff5088 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * -# l2 normalisation, -DPOOL_L2 must be passed at compile time + * + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. 
-DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
 Supported data types: same as @p input_ptr
 * @param[in] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] output_step_x                       output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                     Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] output_step_y                       output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                     Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] output_step_z                       output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] output_stride_w                     Stride of the destination tensor in W dimension (in bytes)
 * @param[in] output_step_w                       output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void pooling_layer_MxN_nhwc(
    TENSOR4D_DECLARATION(input),
    TENSOR4D_DECLARATION(output))
{
    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
#if DST_BATCH_SIZE != 1
    // If batch size != 1, the batch size dimension is collapsed over the height dimension
    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
    int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
#else  //DST_BATCH_SIZE != 1
    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
    int idx_out_n = 0;
#endif // DST_BATCH_SIZE != 1

    // Base addresses: channel offset is applied in bytes; batch offset via the W stride.
    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;

    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
                                           output_stride_w;

    // Accumulator, seeded with the pooling identity (-inf for max, 0 for avg/l2).
    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
    res0 = INITIAL_VALUE;

    // Top-left corner of the pooling window in input coordinates (may be negative due to padding).
    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;

    // Clamp the pooling window to the valid input region.
    int pool_x_s = max((int)0, -idx_in_w);
    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
    int pool_y_s = max((int)0, -idx_in_h);
    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);

#if defined(EXCLUDE_PADDING)
    // Average divisor counts only in-bounds elements.
    int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
#else  // defined(EXCLUDE_PADDING)
    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
#endif // defined(EXCLUDE_PADDING)

#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
    // Global pooling path: compile-time-constant trip counts enable full unrolling.
    for(int y = 0; y < POOL_SIZE_Y; ++y)
    {
#pragma unroll 8
        for(int x = 0; x < POOL_SIZE_X; ++x)
        {
#else  // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
    for(int y = pool_y_s; y < pool_y_e; ++y)
    {
#pragma unroll 8
        for(int x = pool_x_s; x < pool_x_e; ++x)
        {
#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
            data0;
#if defined(FP_MIXED_PRECISION)
            // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
            data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
#else  // defined(FP_MIXED_PRECISION)
            data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
#endif // defined(FP_MIXED_PRECISION)

#if defined(POOL_L2)
            // Raise to power of 2 for L2 Pooling
            data0 *= data0;
#endif // defined(POOL_L2)
            res0 = POOL_OP(res0, data0);
        }
    }

#if defined(POOL_AVG) || defined(POOL_L2)
    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
#endif // defined(POOL_AVG) || defined(POOL_L2)

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res0 = SQRT_OP(res0);
#endif // defined(POOL_L2)

    // Store result
#if defined(FP_MIXED_PRECISION)
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
#else  // defined(FP_MIXED_PRECISION)
    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
#endif // defined(FP_MIXED_PRECISION)
}
#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)

// Vector type used as the predicate operand of select() below.
#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)

/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
 * -# max, -DPOOL_MAX must be passed at compile time
 * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
 * -# average, -DPOOL_AVG must be passed at compile time.
 If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time + * -# l2 normalisation, -DPOOL_L2 must be passed at compile time + * + * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr (Optional) Pointer to the indices tensor. 
 Supported data types: U32
 * @param[in] indices_stride_x                     (Optional) Stride of the indices tensor in X dimension (in bytes)
 * @param[in] indices_step_x                       (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] indices_stride_y                     (Optional) Stride of the indices tensor in Y dimension (in bytes)
 * @param[in] indices_step_y                       (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] indices_stride_z                     (Optional) Stride of the indices tensor in Z dimension (in bytes)
 * @param[in] indices_step_z                       (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] indices_stride_w                     (Optional) Stride of the indices tensor in W dimension (in bytes)
 * @param[in] indices_step_w                       (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
 */
__kernel void pooling_layer_2x2_nhwc(
    TENSOR4D_DECLARATION(input),
    TENSOR4D_DECLARATION(output)
#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
    ,
    TENSOR4D_DECLARATION(indices)
#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
)
{
    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
    int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int idx_out_w = get_global_id(1);
#if DST_BATCH_SIZE != 1
    // If batch size != 1, the batch size dimension is collapsed over the height dimension
    int idx_out_h = get_global_id(2) % DST_HEIGHT;
    int idx_out_n = get_global_id(2) / DST_HEIGHT;
#else  //SRC_BATCH_SIZE != 1
    int idx_out_h = get_global_id(2);
    int idx_out_n = 0;
#endif // SRC_BATCH_SIZE != 1

    // Top-left corner of the 2x2 window in input coordinates (may be negative due to padding).
    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;

    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;

    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
                                           output_stride_w;

    // Clamp the 2x2 window to the valid input region.
    int pool_x_s = max((int)0, -idx_in_w);
    int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
    int pool_y_s = max((int)0, -idx_in_h);
    int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);

    // Number of in-bounds elements (<= 4); used for average pooling with EXCLUDE_PADDING.
    int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);

    // Clamped coordinates of the four window taps. Note: x0==x1 and/or y0==y1 when clamped,
    // so out-of-bound taps reload an in-bounds value and are masked below for non-max pooling.
    int x0 = pool_x_s + idx_in_w;
    int y0 = pool_y_s + idx_in_h;
    int x1 = pool_x_e - 1 + idx_in_w;
    int y1 = pool_y_e - 1 + idx_in_h;

    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);

#if defined(FP_MIXED_PRECISION)
    // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
    data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
    data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
    data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
    data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
#else  // defined(FP_MIXED_PRECISION)
    data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
    data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
    data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
    data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
#endif // defined(FP_MIXED_PRECISION)

#if !defined(POOL_MAX)
    if(filter_size != 4)
    {
        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);

        // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
        data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
        data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
        data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
        data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
    }
#endif // !defined(POOL_MAX)

#if defined(POOL_L2)
    // Raise to power of 2 for L2 Pooling
    data0 *= data0;
    data1 *= data1;
    data2 *= data2;
    data3 *= data3;
#endif /* defined(POOL_L2) */

    // Reduce the four taps into a single vector with the pooling operator.
    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
    res0 = data0;
    res0 = POOL_OP(res0, data1);
    res0 = POOL_OP(res0, data2);
    res0 = POOL_OP(res0, data3);

#if defined(POOL_AVG) || defined(POOL_L2)
#if defined(EXCLUDE_PADDING)
    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
#else  // !defined(EXCLUDE_PADDING)
    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
#endif // defined(EXCLUDE_PADDING)
#endif // defined(POOL_AVG) || defined(POOL_L2)

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res0 = SQRT_OP(res0);
#endif // defined(POOL_L2)

    // Store result
#if defined(FP_MIXED_PRECISION)
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
#else  // defined(FP_MIXED_PRECISION)
    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
#endif // defined(FP_MIXED_PRECISION)

#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)

    // This part is used to return the index of the maximum value
    // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor

    // note: Batch dimension does not contribute in the offset contribution
    VEC_DATA_TYPE(uint, VEC_SIZE)
    base_index = (uint)idx_out_c;

    base_index += VEC_OFFS(uint, VEC_SIZE);

    // Flattened NHWC indices of the four window taps: c + x * C + y * (C * W)
    VEC_DATA_TYPE(uint, VEC_SIZE)
    index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
    VEC_DATA_TYPE(uint, VEC_SIZE)
    index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
    VEC_DATA_TYPE(uint, VEC_SIZE)
    index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
    VEC_DATA_TYPE(uint, VEC_SIZE)
    index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);

    // Tournament reduction: keep the index of the larger value at each step (ties favour the lower index).
    index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
    index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
    index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));

    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
                                           indices_stride_w;

    // Store result
    STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
}
#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl new file mode 100644 index 0000000000..46268a4a88 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) +#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ + { \ + const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \ + const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \ + res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \ + } +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + +#if defined(POOL_AVG) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) */ +#define POOL_OP(x, y) (max((x), (y))) +#endif /* defined(POOL_AVG) */ + +#define DIV_OP(x, y) (x * (1.f / y)) + +#if defined(POOL_L2) +#error "L2 pooling is not supported" +#endif /* defined(POOL_L2) */ + +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. 
 Supported data types are QASYMM8/QASYMM8_SIGNED + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time + * + * @param[in] input_ptr Pointer to the source image. 
 Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in] input_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in] input_step_x                        input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in] input_step_y                        input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] input_step_z                        input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in] input_step_w                        input_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] output_ptr                          Pointer to the destination image. Supported data types: same as @p input_ptr
 * @param[in] output_stride_x                     Stride of the destination tensor in X dimension (in bytes)
 * @param[in] output_step_x                       output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                     Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] output_step_y                       output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                     Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] output_step_z                       output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] output_stride_w                     Stride of the destination tensor in W dimension (in bytes)
 * @param[in] output_step_w                       output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void pooling_layer_MxN_quantized_nhwc(
    TENSOR4D_DECLARATION(input),
    TENSOR4D_DECLARATION(output))
{
    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
    int offset_c  = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
    int idx_out_w = get_global_id(1);
#if DST_BATCH_SIZE != 1
    // If batch size != 1, the batch size dimension is collapsed over the height dimension
    int idx_out_h = get_global_id(2) % DST_HEIGHT;
    int idx_out_n = get_global_id(2) / DST_HEIGHT;
#else  //DST_BATCH_SIZE != 1
    int idx_out_h = get_global_id(2);
    int idx_out_n = 0;
#endif // DST_BATCH_SIZE != 1

    // Top-left corner of the pooling window in input coordinates (may be negative due to padding).
    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;

    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;

    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;

    // Clamp the pooling window to the valid input region.
    int pool_x_s = max((int)0, -idx_in_w);
    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
    int pool_y_s = max((int)0, -idx_in_h);
    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);

#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
    // Divisor counts only in-bounds elements; accumulated inside the loop below.
    int filter_size = 0;
#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)

    // Accumulator in the (wider) ACC_DATA_TYPE to avoid overflow when summing quantized values.
    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
    res0 = INITIAL_VALUE;

    for(int y = pool_y_s; y < pool_y_e; ++y)
    {
        for(int x = pool_x_s; x < pool_x_e; ++x)
        {
            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
            data;
            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
            data0;

            data  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
            data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));

            res0 = POOL_OP(res0, data0);

#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
            filter_size++;
#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
        }
    }

#if defined(POOL_AVG)
    // Round-to-nearest integer division: add half the divisor before dividing.
    res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
#endif // defined(POOL_AVG)

    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
    // Re-scale to the output quantization when input and output quantization infos differ.
    REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */

    // Store result
    STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
}
#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/remap.cl b/src/core/CL/cl_kernels/nhwc/remap.cl new file mode 100644 index 0000000000..0b629fe6c9 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/remap.cl @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +#ifdef DEPTH_OUT +/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation. + * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set. + * + * This kernel performs remapping with this method of pixel coordinate translation: + * out(x,y) = in(mapx(x,y), mapy(x,y)); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16. 
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + * @param[in] border_val Value to use for border around input tensor when in CONSTANT border is selected + */ +__kernel void remap_nearest_neighbour_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + TENSOR4D_DECLARATION(mapx), + TENSOR4D_DECLARATION(mapy), + const float width, + const float height +#ifdef CONSTANT_BORDER + , + const DATA_TYPE border_val +#endif // CONSTANT_BORDER +) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT); + Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT); + + float mapx_coord = (float) * (__global float *)mapx.ptr; + float mapy_coord = (float) * (__global float *)mapy.ptr; + +#ifdef CONSTANT_BORDER + if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1) + { + *((__global DATA_TYPE *)out.ptr) = border_val; + return; + } +#else // CONSTANT_BORDER + mapx_coord = clamp(mapx_coord, 0.0f, width - 1); + mapy_coord = clamp(mapy_coord, 0.0f, height - 1); +#endif // CONSTANT_BORDER + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(mapx_coord), convert_int(mapy_coord), (get_global_id(2) / DEPTH_OUT))); +} + +/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation. 
+ * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set. + * + * This kernel performs remapping with this method of pixel coordinate translation: + * out(x,y) = in(mapx(x,y), mapy(x,y)); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + * @param[in] border_val Value to use for border around input tensor when in CONSTANT border is selected + */ +__kernel void remap_bilinear_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + TENSOR4D_DECLARATION(mapx), + TENSOR4D_DECLARATION(mapy), + const float width, + const float height +#ifdef CONSTANT_BORDER + , + const DATA_TYPE border_val +#endif // CONSTANT_BORDER +) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT); + Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT); + + float mapx_coord = (float) * (__global float *)mapx.ptr; + float mapy_coord = (float) * (__global float *)mapy.ptr; + +#ifdef CONSTANT_BORDER + if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1) + { + *((__global DATA_TYPE *)out.ptr) = border_val; + return; + } +#endif // CONSTANT_BORDER + + const float new_xf = floor(mapx_coord); + const float new_yf = floor(mapy_coord); + const float clamped_x = clamp(new_xf, 0.0f, width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, width - 1); + const float clamped_y = clamp(new_yf, 0.0f, height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, height - 1); + + float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), 
convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); + + const float a = mapx_coord - new_xf; + const float b = 1.f - a; + const float a1 = mapy_coord - new_yf; + const float b1 = 1.f - a1; + const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); + + *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); +} + +#endif // DEPTH_OUT
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl new file mode 100644 index 0000000000..a340b0b8a2 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) + +#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \ + ({ \ + int offset = zo / (int)SRC_DEPTH; \ + xi = xo * (int)STRIDE + offset % (int)STRIDE; \ + yi = yo * (int)STRIDE + offset / (int)STRIDE; \ + zi = zo % SRC_DEPTH; \ + }) + +/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float + * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64 + * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: All + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void reorg_layer_nhwc( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst); + + int xo = get_global_id(1); + int yo = get_global_id(2); + int zo = get_global_id(0); + int xi, yi, zi; + + CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi); + + int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset)); +} +#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl new file mode 100644 index 0000000000..1ea5e73df1 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +#if defined(DEPTH_OUT) +/** Performs scale on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel F32. (NHWC) + * + * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + */ +__kernel void scale_nearest_neighbour_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + +#ifdef SAMPLING_POLICY_TOP_LEFT + float new_x = 
get_global_id(1) * scale_x; + float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; +#elif SAMPLING_POLICY_CENTER + float new_x = (get_global_id(1) + 0.5f) * scale_x; + float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y; +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ +#ifdef ALIGN_CORNERS + new_x = round(new_x); + new_y = round(new_y); +#endif /* ALIGN_CORNERS */ + const float clamped_x = clamp(new_x, 0.0f, input_width - 1); + const float clamped_y = clamp(new_y, 0.0f, input_height - 1); + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))); +} + +/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) + * + * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @note The value to be used at the edges of the images should be given as a preprocessor argument using -DCONSTANT_VALUE=value. + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + * + */ +__kernel void scale_bilinear_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + +#ifdef SAMPLING_POLICY_TOP_LEFT + const float new_x =
get_global_id(1) * scale_x; + const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; +#elif SAMPLING_POLICY_CENTER + const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ + + const float new_xf = floor(new_x); + const float new_yf = floor(new_y); + const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); + const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); + +#ifndef BORDER_MODE_REPLICATE + const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); + const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); + const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); + const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y); + const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y1); + const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y1); + float4 ins = (float4)(ins_0, ins_1, ins_2, 
ins_3); +#else /* BORDER_MODE_REPLICATE */ + float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); +#endif /* BORDER_MODE_REPLICATE */ + + const float a = new_x - new_xf; + const float b = 1.f - a; + const float a1 = new_y - new_yf; + const float b1 = 1.f - a1; + const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); + + *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); +} +#endif /* defined(DEPTH_OUT) */
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl new file mode 100644 index 0000000000..de9bb607b0 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers_asymm.h" +#include "warp_helpers_quantized.h" + +#if defined(DEPTH_OUT) +/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) + * + * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5 + * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. 
-DOFFSET=1 + * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @note The value to be used at the edges of the images should be given as a preprocessor argument using -DCONSTANT_VALUE=value. + * + * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image.
Supported data types: same as @p in_ptr
 * @param[in] out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in] out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in] out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] out_stride_z                      Stride of the destination image in Z dimension (in bytes)
 * @param[in] out_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in] input_width                       Input image width
 * @param[in] input_height                      Input image height
 * @param[in] scale_x                           The scale factor along x dimension
 * @param[in] scale_y                           The scale factor along y dimension
 *
 * @note The constant border value is not a kernel argument: it is provided at compile time as -DCONSTANT_VALUE
 *       and is only used when BORDER_MODE_REPLICATE is not defined (see below).
 */
__kernel void scale_bilinear_quantized_nhwc(
    TENSOR4D_DECLARATION(in),
    TENSOR4D_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float scale_x,
    const float scale_y)
{
    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);

    // Map the output coordinate back to a sub-pixel input coordinate.
    // get_global_id(1) is the output x; get_global_id(2) % DEPTH_OUT is the output y (NHWC).
#ifdef SAMPLING_POLICY_TOP_LEFT
    const float new_x = get_global_id(1) * scale_x;
    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
#elif SAMPLING_POLICY_CENTER
    // NOTE(review): this branch tests the macro's *value* (not defined()), so
    // -DSAMPLING_POLICY_CENTER must expand to a non-zero integer - confirm the host build options.
    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */

    // Integer top-left corner of the 2x2 interpolation neighbourhood, plus coordinates
    // clamped to the valid image range.
    const float new_xf     = floor(new_x);
    const float new_yf     = floor(new_y);
    const float clamped_x  = clamp(new_xf, 0.0f, input_width - 1);
    const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
    const float clamped_y  = clamp(new_yf, 0.0f, input_height - 1);
    const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);

#ifndef BORDER_MODE_REPLICATE
    // CONSTANT border mode: any of the four taps that falls outside the image reads
    // CONSTANT_VALUE instead of the clamped pixel.
    const bool check_x  = (0.f <= new_xf && new_xf < input_width);
    const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
    const bool check_y  = (0.f <= new_yf && new_yf < input_height);
    const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
    const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
                             check_x && check_y);
    const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
                             check_x1 && check_y);
    const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
                             check_x && check_y1);
    const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
                             check_x1 && check_y1);
    int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
#else /* BORDER_MODE_REPLICATE */
    // REPLICATE border mode: the clamped coordinates already repeat the edge pixels.
    int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
#endif /* BORDER_MODE_REPLICATE */

    // Bilinear weights from the fractional part of the source coordinate.
    const float a  = new_x - new_xf;
    const float b  = 1.f - a;
    const float a1 = new_y - new_yf;
    const float b1 = 1.f - a1;
    // Dequantize the four taps (subtract OFFSET, multiply by SCALE) before interpolating in float.
    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;

    const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));

    // Requantize the interpolated value (round towards +infinity) and saturate to DATA_TYPE.
    DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);

    *((__global DATA_TYPE *)out.ptr) = res;
}
#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
new file mode 100644
index 0000000000..785206e3b9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
@@ -0,0 +1,155 @@
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
/** Calculate the space to batch conversion. (NHWC)
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
 *
 * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: All
 * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the first source image
 * @param[in]  paddings_ptr                          Pointer to the second source image. Supported data types: S32
 * @param[in]  paddings_stride_x                     Stride of the paddings tensor in X dimension (in bytes)
 * @param[in]  paddings_step_x                       paddings_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  paddings_stride_y                     Stride of the paddings tensor in Y dimension (in bytes)
 * @param[in]  paddings_step_y                       paddings_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  paddings_offset_first_element_in_bytes The offset of the first element in the second source image
 * @param[in]  block_shape_ptr                       Pointer to the block shape tensor. Supported data types: S32
 * @param[in]  block_shape_stride_x                  Stride of the block shape tensor in X dimension (in bytes)
 * @param[in]  block_shape_step_x                    block_shape_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
 * @param[in]  batch_id                              The output tensor batch id
 * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
 */
__kernel void space_to_batch_nhwc(
    TENSOR4D_DECLARATION(input),
    IMAGE_DECLARATION(paddings),
    VECTOR_DECLARATION(block_shape),
    const int batch_id,
    TENSOR3D_DECLARATION(output))
{
    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Runtime paddings read from the paddings tensor: column 0/1 = left/right, row 0/1 = x/y.
    // NOTE(review): pad_right_x and pad_right_y are read but never used below - presumably kept
    // for symmetry with the host-side contract; confirm before removing.
    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));

    // Runtime block shape (x, y) read from the block shape tensor.
    int block_x = *((__global int *)vector_offset(&block, 0));
    int block_y = *((__global int *)vector_offset(&block, 1));

    // NHWC: dim 0 = channels, dim 1 = output x, dim 2 = output y.
    const int out_x = get_global_id(1);
    const int out_y = get_global_id(2);
    const int z     = get_global_id(0);

    // Position of this output element inside the padded input plane.
    // NOTE(review): BATCH_IN is used here although only BATCH_SIZE appears in the guard above -
    // presumably both are defined together by the host; confirm.
    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);

    // Only copy when the position lands inside the un-padded input region;
    // padded output elements are left untouched.
    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
    {
        const int w    = batch_id % BATCH_IN;
        const int in_x = pos_x - pad_left_x;
        const int in_y = pos_y - pad_left_y;

        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
    }
}
#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)

#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
/** Calculate the space to batch conversion. (NHWC)
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
 * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
 * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
 * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
 * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
 * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
 * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
 *
 * @param[in]  input_ptr                            Pointer to the source tensor.
Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] batch_id The output tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_batch_static_nhwc( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + int block_x = BLOCK_SHAPE_X; + int block_y = BLOCK_SHAPE_Y; + + const int out_x = get_global_id(1); + const int out_y = get_global_id(2); + const int z = get_global_id(0); + + const int pos_x 
= out_x * block_x + ((batch_id / BATCH_IN) % block_x); + const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); + + if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN) + { + const int w = batch_id % BATCH_IN; + const int in_x = pos_x - PAD_LEFT_X; + const int in_y = pos_y - PAD_LEFT_Y; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); + } +} +#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl new file mode 100644 index 0000000000..d44e78d990 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Space to depth transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_depth_nhwc( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0) % r; + + const int in_x = x * 
BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; + const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id)); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
new file mode 100644
index 0000000000..74b9674a88
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
@@ -0,0 +1,80 @@
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

/** This function applies upsample on an input image. (NHWC)
 *
 * Each source element is written to a 2x2 patch of the destination (offsets
 * (0,0), (1,0), (0,1), (1,1) along the y/z tensor dimensions below), i.e. a
 * nearest-neighbour 2x upsample over the two spatial dimensions.
 *
 * @attention The following variables must be passed at compile time:
 * -# -DDATA_TYPE = Tensor data type. Supported data types: All
 * -# -DVEC_SIZE_IN = Input vector size
 * -# -DVEC_SIZE_OUT = Output vector size
 * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
 * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void upsample_layer_nhwc(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
    // Check if access on width gets out of bounds
    // If it does shift access vector to access elements within bounds
    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;

    // NOTE(review): the vector width is hard-coded to 16 here regardless of
    // VEC_SIZE_IN/VEC_SIZE_OUT - presumably the host always dispatches with
    // vector size 16 on this path; confirm.
    VEC_DATA_TYPE(DATA_TYPE, 16)
    data = vload16(0, (__global DATA_TYPE *)src.ptr);

    // Replicate the loaded vector into the 2x2 destination patch.
    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
    // Scalar fallback: replicate the single source element into the 2x2 patch.
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
new file mode 100644
index 0000000000..8d5fd3437f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
@@ -0,0 +1,1075 @@
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(SRC_DIM_Z)

// Applies one row of the Winograd filter-transform matrix for a 7-tap filter
// (used by the 2x2 output tile / 7x7 kernel variants further below) to the 7
// input taps in tmp, producing the 8 transformed values in out.
#define OUTPUT_ROW_2x2_7x7(out, tmp)                                                                                     \
    ({                                                                                                                   \
        out.s0 = -tmp.s0 / 36.f;                                                                                         \
        out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f;                                  \
        out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f;                                  \
        out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
        out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
        out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
        out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
        out.s7 = tmp.s6;                                                                                                 \
    })

/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_4x4_3x3_nhwc(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    // One work-item per (channel, filter) pair: dim 0 = channel, dim 1 = kernel x,
    // dim 2 = filter index (W dimension).
    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;

    // Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // 1x3 filter: the three taps sit along the Z (height) dimension.
    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // 3x1 / 3x3 filter: first row along the Y dimension.
    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    // Full 3x3 filter: remaining two rows.
    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // Compute the 6x6 (or 1x6 for 3x1/1x3) transformed filter G * w * G^T,
    // fully unrolled with the transform-matrix constants folded in.
    // Row 0
    DATA_TYPE out00, out01, out02, out03, out04, out05;
    out00 = (w00) / 16.f;
    out01 = (-w00 - w01 - w02) / 24.f;
    out02 = (-w00 + w01 - w02) / 24.f;
    out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
    out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
    out05 = (w02) / 4.f;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    DATA_TYPE out10, out11, out12, out13, out14, out15;
    out10 = (-w00 - w10 - w20) / 24.f;
    out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
    out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
    out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
    out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
    out15 = (-w02 - w12 - w22) / 6.f;

    // Row 2
    DATA_TYPE out20, out21, out22, out23, out24, out25;
    out20 = (-w00 + w10 - w20) / 24.f;
    out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
    out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
    out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
    out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
    out25 = (-w02 + w12 - w22) / 6.f;

    // Row 3
    DATA_TYPE out30, out31, out32, out33, out34, out35;
    out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
    out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
    out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
    out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
    out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
    out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;

    // Row 4
    DATA_TYPE out40, out41, out42, out43, out44, out45;
    out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
    out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
    out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
    out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
    out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
    out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;

    // Row 5
    DATA_TYPE out50, out51, out52, out53, out54, out55;
    out50 = (w20) / 4.f;
    out51 = (-w20 - w21 - w22) / 6.f;
    out52 = (-w20 + w21 - w22) / 6.f;
    out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
    out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
    out55 = (w22);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    int x0 = get_global_id(2); // idx filter
    int y0 = get_global_id(0); // idx channel

    // Get output address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;

    // Store the values across the channels
    // 36 channels for 3x3 kernels
    // 6 channels for 3x1 or 1x3 kernels
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out10;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out11;
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out12;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out13;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}

/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x4_5x5_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * 
src_stride_y)); +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * 
src_stride_z + 4 * src_stride_y)); +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + // Row 0 + VEC_DATA_TYPE(DATA_TYPE, 8) + out0 = 0.0f; + out0.s0 = w00; + out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f; + out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f; + out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f; + out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f; + out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f; + out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f; + out0.s7 = w04; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Row 1 + VEC_DATA_TYPE(DATA_TYPE, 8) + out1 = 0.0f; + out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f; + out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f; + out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f; + out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f * + (w04 + w14 + w24 + w34 + w44)) / 405.f; + out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f * + (w04 + w14 + w24 + w34 + w44)) / 405.f; + out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) + + (w04 + w14 + w24 + w34 + w44)) / 810.f; + out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 
+ w13 + w23 + w33 + w43) + + (w04 + w14 + w24 + w34 + w44)) / 810.f; + out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f; + + // Row 2 + VEC_DATA_TYPE(DATA_TYPE, 8) + out2 = 0.0f; + out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f; + out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f; + out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f; + out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f * + (w04 - w14 + w24 - w34 + w44)) / 405.f; + out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f * + (w04 - w14 + w24 - w34 + w44)) / 405.f; + out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) + + (w04 - w14 + w24 - w34 + w44)) / 810.f; + out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) + + (w04 - w14 + w24 - w34 + w44)) / 810.f; + out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f; + + // Row 3 + VEC_DATA_TYPE(DATA_TYPE, 8) + out3 = 0.0f; + out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f; + out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + + (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f; + out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 
+ 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - + (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f; + out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f + * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f; + out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f + * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f; + out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f; + out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f; + out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f; + + // Row 4 + VEC_DATA_TYPE(DATA_TYPE, 8) + out4 = 0.0f; + out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f; + out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + + (w03 - 2.f * 
w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f; + out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - + (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f; + out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f + * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f; + out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f + * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f; + out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f; + out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f; + out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f; + + // Row 5 + VEC_DATA_TYPE(DATA_TYPE, 8) + out5 = 0.0f; + out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f; + out5.s1 = -((16.f * w00 + 8.f * 
w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + + (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f; + out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - + (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f; + out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f + * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f; + out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f + * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f; + out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f; + out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f; + out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f; 
+ + // Row 6 + VEC_DATA_TYPE(DATA_TYPE, 8) + out6 = 0.0f; + out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f; + out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + + (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f; + out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - + (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f; + out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f + * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f; + out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f + * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f; + out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f; + out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * 
w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f; + out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f; + + // Row 7 + VEC_DATA_TYPE(DATA_TYPE, 8) + out7 = 0.0f; + out7.s0 = w40; + out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f; + out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f; + out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f; + out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f; + out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f; + out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f; + out7.s7 = w44; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int x0 = get_global_id(2); // idx filter + int y0 = get_global_id(0); // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; + *(__global DATA_TYPE *)(dst_addr + 12 * 
dst_stride_z) = out1.s4; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; + *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; + *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; + *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; + *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; + *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; + *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; + *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; + *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; + *(__global DATA_TYPE 
*)(dst_addr + 42 * dst_stride_z) = out5.s2; + *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; + *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; + *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; + *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; + *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; + *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; + *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; + *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; + *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; + *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; + *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; + *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; + *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; + *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; + *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; + *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; + *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; + *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; + *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; + *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; + *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} +/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. 
-DSRC_DIM_Z=64 + * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x2_7x7_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * 
src_stride_y)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); + DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); + DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)); +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w32 = *((__global 
DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z 
+ 2 * src_stride_y)); + DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y)); + +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + VEC_DATA_TYPE(DATA_TYPE, 8) + tmp = 0.0f; + + // Row 0 + VEC_DATA_TYPE(DATA_TYPE, 8) + out0 = 0.0f; + + out0.s0 = -w00 / 36.0f; + out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f; + out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f; + out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f; + out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f; + out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f; + out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f; + out0.s7 = w06; + + out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + // Row 1 + VEC_DATA_TYPE(DATA_TYPE, 8) + out1 = 0.0f; + + tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f; + tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f; + tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f; + tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f; + tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f; + tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f; + tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f; + + OUTPUT_ROW_2x2_7x7(out1, tmp); + + // Row 2 + VEC_DATA_TYPE(DATA_TYPE, 8) + out2 = 0.0f; + + tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f; + tmp.s1 = 
(w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f; + tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f; + tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f; + tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f; + tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f; + tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f; + + OUTPUT_ROW_2x2_7x7(out2, tmp); + + // Row 3 + VEC_DATA_TYPE(DATA_TYPE, 8) + out3 = 0.0f; + + tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f; + tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f; + tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f; + tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f; + tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f; + tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f; + tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f; + + OUTPUT_ROW_2x2_7x7(out3, tmp); + + // Row 4 + VEC_DATA_TYPE(DATA_TYPE, 8) + out4 = 0.0f; + + tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f; + tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f; + tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f; + tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f; + tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f; + tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f; + tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f; + + 
OUTPUT_ROW_2x2_7x7(out4, tmp); + + // Row 5 + VEC_DATA_TYPE(DATA_TYPE, 8) + out5 = 0.0f; + + tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f; + tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f; + tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f; + tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f; + tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f; + tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f; + tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f; + + OUTPUT_ROW_2x2_7x7(out5, tmp); + + // Row 6 + VEC_DATA_TYPE(DATA_TYPE, 8) + out6 = 0.0f; + + tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f; + tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f; + tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f; + tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f; + tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f; + tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f; + tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f; + + OUTPUT_ROW_2x2_7x7(out6, tmp); + + // Row 7 + VEC_DATA_TYPE(DATA_TYPE, 8) + out7 = 0.0f; + + tmp.s0 = w60; + tmp.s1 = w61; + tmp.s2 = w62; + tmp.s3 = w63; + tmp.s4 = w64; + tmp.s5 = w65; + tmp.s6 = w66; + + OUTPUT_ROW_2x2_7x7(out7, tmp); + +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int 
x0 = get_global_id(2); // idx filter + int y0 = get_global_id(0); // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; + *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; + *(__global DATA_TYPE 
*)(dst_addr + 24 * dst_stride_z) = out3.s0; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; + *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; + *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; + *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; + *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; + *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; + *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; + *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; + *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; + *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; + *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; + *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; + *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; + *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; + *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; + *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; + *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; + *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; + *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; + *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; + *(__global 
DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; + *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; + *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; + *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; + *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; + *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; + *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; + *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; + *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; + *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} +#endif // defined(SRC_DIM_Z) + +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_3x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, 
+ dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_5x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x1_7x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, 
+ dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x3_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x5_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, 
+ dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x2_1x7_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl new file mode 100644 index 0000000000..4865982a55 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl @@ -0,0 +1,953 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "tile_helpers.h" + +#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ + comm_fact.s2 = 2.5f * tmp.s3; \ + comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ + comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ + comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ + \ + out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ + out.s1 = comm_fact.s0 + comm_fact.s1; \ + out.s2 = comm_fact.s0 - comm_fact.s1; \ + out.s3 = comm_fact.s3 + comm_fact.s4; \ + out.s4 = comm_fact.s4 - comm_fact.s3; \ + out.s5 = comm_fact.s5 + comm_fact.s6; \ + out.s6 = comm_fact.s5 - comm_fact.s6; \ + out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ + }) + +#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ + comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ + comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ + comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ + out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ + out.s1 = comm_fact.s0 - comm_fact.s1; \ + out.s2 = comm_fact.s0 + comm_fact.s1; \ + out.s3 = comm_fact.s2 - comm_fact.s3; \ + out.s4 = comm_fact.s2 + comm_fact.s3; \ + out.s5 = comm_fact.s4 - comm_fact.s5; \ + out.s6 = comm_fact.s4 + comm_fact.s5; \ + out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ + }) + +#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) + +#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) +//! 
@cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
 *                                               Supported data types: F32/F16
 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                          Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
//!
@endcond +__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _INUM_TILES_X NUM_TILES_X +#define _INUM_TILES_Y NUM_TILES_Y + + int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + x -= PAD_LEFT; + y -= PAD_TOP; + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 6, 1, in); + TILE(DATA_TYPE, 6, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 6, + { + in[i].v = 0; + }) + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + + TILE(DATA_TYPE, 6, 1, com); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + in[i].v *= 4.0f; + }) + + com[0].v = in[2].v - 4.f * in[0].v; + com[1].v = in[3].v - 4.f * in[1].v; + com[2].v = in[4].v - 4.f * in[2].v; + com[3].v = in[5].v - 4.f * in[3].v; + com[4].v = in[3].v - in[1].v; + com[4].v = com[4].v + com[4].v; + com[5].v = in[4].v - in[2].v; + + out[0].v = com[2].v - com[0].v; + out[1].v = com[2].v + com[1].v; + out[2].v = com[2].v - com[1].v; + out[3].v = com[5].v + com[4].v; + out[4].v = com[5].v - com[4].v; + out[5].v = com[3].v - com[1].v; + + TILE(uint, 6, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 
0, 1, 6, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 36, 1, in); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 36, + { + in[i].v = 0; + }) + + // Load the tile from a NHWC tensor + T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); + + TILE(DATA_TYPE, 6, 1, com); + TILE(DATA_TYPE, 36, 1, tmp); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v; + com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v; + com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v; + com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v; + com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v; + com[4].v = com[4].v + com[4].v; + com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v; + tmp[i + 0 * 6].v = com[2].v - com[0].v; + tmp[i + 1 * 6].v = com[2].v + com[1].v; + tmp[i + 2 * 6].v = com[2].v - com[1].v; + tmp[i + 3 * 6].v = com[5].v + com[4].v; + tmp[i + 4 * 6].v = com[5].v - com[4].v; + tmp[i + 5 * 6].v = com[3].v - com[1].v; + }) + + TILE(DATA_TYPE, 36, 1, out); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + com[0].v = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v; + com[1].v = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v; + com[2].v = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v; + com[3].v = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v; + com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v; + com[4].v = com[4].v + com[4].v; + com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v; + out[i * 6 + 0].v = com[2].v - com[0].v; + out[i * 6 + 1].v = com[2].v + com[1].v; + out[i * 6 + 2].v = com[2].v - com[1].v; + out[i * 6 + 3].v = com[5].v + com[4].v; + out[i * 
6 + 4].v = com[5].v - com[4].v; + out[i * 6 + 5].v = com[3].v - com[1].v; + }) + + // Compute destination address + TILE(uint, 36, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 36, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _INUM_TILES_X NUM_TILES_X +#define _INUM_TILES_Y NUM_TILES_Y + + int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + x -= PAD_LEFT; + y -= PAD_TOP; + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 8, 1, in); + TILE(DATA_TYPE, 8, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + + TILE(DATA_TYPE, 1, 8, com); + + com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v; + com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v; + com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v; + com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v; + com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; + com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v; + out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v; + out[1].s[0] = com[0].s[0] + com[0].s[1]; + out[2].s[0] = com[0].s[0] - com[0].s[1]; + out[3].s[0] = com[0].s[3] + com[0].s[2]; + out[4].s[0] = 
com[0].s[3] - com[0].s[2]; + out[5].s[0] = com[0].s[4] + com[0].s[5]; + out[6].s[0] = com[0].s[4] - com[0].s[5]; + out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v; + + TILE(uint, 8, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 64, 1, in); + TILE(DATA_TYPE, 64, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // Load the tile from a NHWC tensor + T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); + + TILE(DATA_TYPE, 8, 8, com); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x + com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x + com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x + com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x + com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; + com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0]; + com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0]; + com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0]; + }) + + 
TILE(DATA_TYPE, 8, 8, tmp); + tmp[0].v = com[6].v; + tmp[1].v = com[0].v + com[1].v; + tmp[2].v = com[0].v - com[1].v; + tmp[3].v = com[2].v + com[3].v; + tmp[4].v = com[2].v - com[3].v; + tmp[5].v = com[4].v + com[5].v; + tmp[6].v = com[4].v - com[5].v; + tmp[7].v = com[7].v; + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6]; + com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5]; + com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5]; + com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6]; + com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6]; + com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5]; + out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6]; + out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1]; + out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1]; + out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2]; + out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2]; + out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5]; + out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5]; + out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7]; + }) + + TILE(uint, 64, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 64, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +//! 
@cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1 or 1x7 and the output tile is 2x2/2x1 or 1x2 when the data layout is NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in]  src_ptr                            Pointer to the source image. Supported data types: F32/F16
 * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // One work-item per (channel, output tile, batch) triple.
    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

    // All the tensor dimensions are passed at compile time.
    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _INUM_TILES_X NUM_TILES_X
#define _INUM_TILES_Y NUM_TILES_Y

    // Top-left (x, y) of the input patch for this tile; padding handled via zero-init below.
    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
    x -= PAD_LEFT;
    y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 1-D case (7x1 or 1x7 filter): 8-point input tile, single transform pass.
    TILE(DATA_TYPE, 8, 1, in);
    TILE(DATA_TYPE, 8, 1, out);

    // Initialize the input tile
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        in[i].v = 0;
    })

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

    // Pre-scale the input by -36. NOTE(review): presumably compensated by a matching
    // scaling in the 1-D weight/output transform - confirm against the companion kernels.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        in[i].v *= (DATA_TYPE)-36.0f;
    })

    TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };

    // B^T d for the F(2,7) 8-point transform; com holds the shared sub-expressions.
    com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
    com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
    com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
    com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
    com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
    out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
    out[1].s[0] = com[0].s[0] - com[0].s[1];
    out[2].s[0] = com[0].s[0] + com[0].s[1];
    out[3].s[0] = com[0].s[2] - com[0].s[3];
    out[4].s[0] = com[0].s[2] + com[0].s[3];
    out[5].s[0] = com[0].s[4] - com[0].s[5];
    out[6].s[0] = com[0].s[4] + com[0].s[5];
    out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;

    TILE(uint, 8, 1, dst_indirect_y);

    // dst Y index: tile id + i * (total tiles) + batch * (total tiles * 8)
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 2-D case (7x7 filter): 8x8 input tile, row-major in in[row * 8 + col].
    TILE(DATA_TYPE, 64, 1, in);
    TILE(DATA_TYPE, 64, 1, out);

    // Initialize the input tile
    LOOP_UNROLLING(int, i, 0, 1, 64,
    {
        in[i].v = 0;
    })

    // Load the tile from a NHWC tensor
    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

    TILE(DATA_TYPE, 8, 8, com);

    // First pass over the columns; com[k].s[i] is row k of B^T d for column i.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
        com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
        com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
        com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
    })

    // tmp = B^T d (rows recombined from the common sub-expressions)
    TILE(DATA_TYPE, 8, 8, tmp);
    tmp[0].v = com[6].v;
    tmp[1].v = com[0].v - com[1].v;
    tmp[2].v = com[0].v + com[1].v;
    tmp[3].v = com[2].v - com[3].v;
    tmp[4].v = com[2].v + com[3].v;
    tmp[5].v = com[4].v - com[5].v;
    tmp[6].v = com[4].v + com[5].v;
    tmp[7].v = com[7].v;

    // Second pass along the rows: out = (B^T d) B
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
        com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
        com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
        out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
        out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
        out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
        out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
        out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
        out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
        out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
        out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
    })

    // dst Y index: tile id + i * (total tiles) + batch * (total tiles * 64)
    TILE(uint, 64, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, 64,
    {
        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

//! 
@cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in]  src_ptr                            Pointer to the source image. Supported data types: F32/F16
 * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Thin wrapper: the 4x1/3x1 case is handled by the generic 4x4/3x3 kernel,
    // whose 1-D path is selected at compile time via -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL.
    winograd_input_transform_4x4_3x3_stepz1_nhwc(
        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
        src_stride_z, src_step_z, src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
        dst_offset_first_element_in_bytes);
}

//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                            Pointer to the source image. 
Supported data types: F32/F16
 * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Thin wrapper: the 4x1/5x1 case is handled by the generic 4x4/5x5 kernel,
    // whose 1-D path is selected at compile time via -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL.
    winograd_input_transform_4x4_5x5_stepz1_nhwc(
        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
        src_stride_z, src_step_z, src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
        dst_offset_first_element_in_bytes);
}

//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                            Pointer to the source image. 
Supported data types: F32/F16
 * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Thin wrapper: the 2x1/7x1 case is handled by the generic 2x2/7x7 kernel,
    // whose 1-D path is selected at compile time via -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL.
    winograd_input_transform_2x2_7x7_stepz1_nhwc(
        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
        src_stride_z, src_step_z, src_stride_w, src_step_w,
        src_offset_first_element_in_bytes,
        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
        dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
        dst_offset_first_element_in_bytes);
}

//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                            Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} +#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl new file mode 100644 index 0000000000..0fcd04e713 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x2_7x7_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _INUM_TILES_X NUM_TILES_X + + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + TILE(DATA_TYPE, 8, N0, in); + TILE(DATA_TYPE, 2, N0, out); + TILE(uint, 8, 1, src_indirect_y); + + // Calculate the indirect Y for the source tensor + LOOP_UNROLLING(int, i, 0, 1, 8, + { + src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + + // Load the values across the 8 channels to compose the 8x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // Compute out0 and out01 + out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v; + out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v; + +#if defined(HAS_BIAS) + // Add bias + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out); +#endif // defined(HAS_BIAS) + + T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 2, 1, dst_indirect_y); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 2, + { + int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 2, + { + int x_c = min(x_out + xk, 
((int)_IDST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 64, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(DATA_TYPE, 16, N0, tmp); + TILE(uint, 64, 1, src_indirect_y); + + // Calculate the indirect Y for the source tensor + LOOP_UNROLLING(int, i, 0, 1, 64, + { + src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // Load the values across the 64 channels to compose the 8x8 tile + T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v; + tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v; + }) + + // Compute the 2x2 output tile + LOOP_UNROLLING(int, i, 0, 1, 2, + { + out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v; + out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v; + }) + +#if defined(HAS_BIAS) + // Add bias + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // defined(HAS_BIAS) + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + 
TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 2, + { + LOOP_UNROLLING(int, xk, 0, 1, 2, + { + int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); + int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH; + dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. 
-DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] dst_size Size of the destination tensor, minus the last padding + */ +__kernel void winograd_output_transform_4x4_3x3_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // defined(HAS_BIAS) + int dst_size) +{ + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 6, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(uint, 6, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 6, + { + in[i].v = 0; + }) + + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // Compute out00, out01, 
out02 and out03 + out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v; + out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v; + out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v; + out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v; + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH; + dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH; + dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Calculate the indirect Y for the source tensor + TILE(DATA_TYPE, 36, N0, in); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 36, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 36, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36); + }) + + // Initialize the input tile + 
LOOP_UNROLLING(int, i, 0, 1, 36, + { + in[i].v = 0; + }) + + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + tmp[0].v = in[6 + i].v + in[12 + i].v; + tmp[1].v = in[6 + i].v - in[12 + i].v; + tmp[2].v = in[18 + i].v + in[24 + i].v; + tmp[3].v = in[18 + i].v - in[24 + i].v; + tmp[3].v = tmp[3].v + tmp[3].v; + in[i].v = in[i].v + tmp[0].v + tmp[2].v; + in[6 + i].v = tmp[3].v + tmp[1].v; + in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); + in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v; + }) + + // Compute the output tile + TILE(DATA_TYPE, 16, N0, out); + + LOOP_UNROLLING(int, i, 0, 1, 4, + { + tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v; + tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v; + tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v; + tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v; + tmp[3].v = tmp[3].v + tmp[3].v; + out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v; + out[4 * i + 1].v = tmp[3].v + tmp[1].v; + out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); + out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v; + }) + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 16, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + int y_c = min(y_out + yk, 
((int)DST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; + dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x4_5x5_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + TILE(DATA_TYPE, 8, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 8, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + + // "in" contains 1x8 or 8x1 tile here + T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // A^T * in, and in this degenerate case out consists of 1 column/row + tmp[0].v = in[1].v - in[2].v; + tmp[1].v = 2.0f * (in[3].v - in[4].v); + tmp[2].v = 2.0f * (in[5].v + in[6].v); + tmp[3].v = in[3].v + in[4].v; + out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v; + out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v); + out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v; + out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v; + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH; + 
dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH; + dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Calculate the indirect Y for the source tensor + TILE(DATA_TYPE, 64, N0, in); + TILE(DATA_TYPE, 6, N0, tmp); + TILE(uint, 64, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 64, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // "in" here is 8x8 tile + T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // A^T * in + LOOP_UNROLLING(int, i, 0, 1, 8, + { + tmp[0].v = in[8 + i].v + in[16 + i].v; + tmp[1].v = in[8 + i].v - in[16 + i].v; + tmp[2].v = in[24 + i].v + in[32 + i].v; + tmp[3].v = in[24 + i].v - in[32 + i].v; + tmp[3].v = tmp[3].v + tmp[3].v; + tmp[4].v = in[40 + i].v + in[48 + i].v; + tmp[4].v = tmp[4].v + tmp[4].v; + tmp[5].v = in[40 + i].v - in[48 + i].v; + + // 4x8 matrix as a result + in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); + in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); + in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v); + in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v; + }) + + 
// Compute the output tile + TILE(DATA_TYPE, 16, N0, out); + + // in * A, with in = A^T * in as above + LOOP_UNROLLING(int, i, 0, 1, 4, + { + tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v; + tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v; + tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v; + tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v; + tmp[3].v = tmp[3].v + tmp[3].v; + tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v; + tmp[4].v = tmp[4].v + tmp[4].v; + tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v; + + // 4x4 tile + out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); + out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); + out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v; + out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v; + }) + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 16, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; + dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || 
defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x1_7x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_3x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_5x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x2_1x7_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x3_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x5_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
\ No newline at end of file |