diff options
Diffstat (limited to 'src/core/CL/cl_kernels/nchw')
26 files changed, 7682 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/nchw/batch_to_space.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl new file mode 100644 index 0000000000..89129cff3f --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BATCH_SIZE) +/** Batch to space transformation. (NCHW) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void batch_to_space_nchw( + TENSOR3D_DECLARATION(input), + const int batch_id, + VECTOR_DECLARATION(block_shape), + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); + + const int block_x = *((__global int *)vector_offset(&block, 0)); + const int block_y = *((__global int *)vector_offset(&block, 1)); + + const int r = (BATCH_SIZE / (block_x * block_y)); + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + const int w = batch_id % r; + + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) + +#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) +/** Batch to space transformation. (NCHW) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 + * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void batch_to_space_static_nchw( + TENSOR3D_DECLARATION(input), + const int batch_id, + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + const int block_x = BLOCK_SHAPE_X; + const int block_y = BLOCK_SHAPE_Y; + + const int r = (BATCH_SIZE / (block_x * block_y)); + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + const int w = batch_id % r; + + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl new file mode 100644 index 0000000000..2d466661b3 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define ADD_OP(a, b) ((a) + (b)) +#define SUB_OP(a, b) ((a) - (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define INVSQRT_OP(a) rsqrt((a)) +#define SQCVT_SAT(a) (a) + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) +#include "activation_float_helpers.h" + +/** Apply batch normalization. + * + * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. 
-DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p input_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + * @param[in] epsilon Epsilon parameter in the batch normalization equation + */ +__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR3D_DECLARATION(output), +#endif /* not IN_PLACE */ + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(var), +#ifndef USE_DEFAULT_BETA + VECTOR_DECLARATION(beta), +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + VECTOR_DECLARATION(gamma), +#endif /* USE_DEFAULT_GAMMA */ + float epsilon) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D out = in; +#else /* IN_PLACE */ + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector var = CONVERT_TO_VECTOR_STRUCT(var); +#ifndef USE_DEFAULT_BETA + Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); +#endif /* USE_DEFAULT_GAMMA */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + denominator = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + numerator = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + x_bar = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = 0; + + const int current_slice = get_global_id(2); + + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); + denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x)); + denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); + + // Calculate x bar and store results + numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * 
mean.stride_x)); + numerator = SUB_OP(data, numerator); + x_bar = MUL_OP(numerator, denominator); + +#ifndef USE_DEFAULT_GAMMA + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x)); + + res = MUL_OP(gamma_vec, x_bar); +#else /* USE_DEFAULT_GAMMA */ + // gamma is equal to 1, no need to perform multiplications + res = x_bar; +#endif /* USE_DEFAULT_GAMMA */ + +#ifndef USE_DEFAULT_BETA + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x)); + // beta is not zero, hence we need to perform the addition + res = ADD_OP(res, beta_vec); +#endif /* USE_DEFAULT_BETA */ + + res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL); + + VSTORE(VEC_SIZE) + (res, 0, (__global DATA_TYPE *)out.ptr); +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl new file mode 100644 index 0000000000..57d82e1e6f --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl @@ -0,0 +1,103 @@ +/* +* Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) + +// Check valid VEC_SIZES +#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 +#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported" +#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 + +#define DIV_MOD_UINT(x, y, div_res, mod_res) \ + ({ \ + div_res = (uint)((x) * (float)(1.0f / (float)(y))); \ + uint r = div_res * (y); \ + mod_res = (x)-r; \ + }) + +/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details. + * + * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4 + * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64 + * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2 + * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1 + * K is equal to num_channels / num_groups. + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: All + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst)) +{ + uint curr_channel = 0; // channel id 
of input + uint batch_id = 0; // batch id + uint group_id = 0; // group id + uint channel_id = 0; // channel id within the group + + // Compute curr_channel and batch_id + DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel); + + // Compute group_id and channel_id + DIV_MOD_UINT(curr_channel, K, group_id, channel_id); + + const uint x = get_global_id(0) * VEC_SIZE; + const uint y = get_global_id(1) * 2; + const uint z = channel_id * NUM_GROUPS + group_id; + + // Load the Nx2 block + const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + + // Store blocks + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w; + VSTORE(VEC_SIZE) + (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y)); + VSTORE(VEC_SIZE) + (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y)); +} + +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl new file mode 100644 index 0000000000..b9f223fe9d --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Depth to space transformation. (NCHW) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. 
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void depth_to_space_nchw( + TENSOR3D_DECLARATION(input), + const int batch_id, + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2) % r; + + const int out_x = x * BLOCK_SHAPE + (get_global_id(2) 
/ r) % BLOCK_SHAPE; + const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl new file mode 100644 index 0000000000..e0203f7408 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) +/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW) + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. 
-DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale Pointer to buffer with the per channel quantized scales + */ +__kernel void dequantization_layer_per_channel_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + __global float *scale) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + 
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = scale[get_global_id(2)]; + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl new file mode 100644 index 0000000000..8ab2d1d4ea --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#undef CONVERT_SAT + +#define ADD_OP(a, b) ((a) + (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define CONVERT_SAT(a, b) ((a)) + +#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if STRIDE_X == 3 +#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size +#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size) +#elif STRIDE_X == 2 +#define INPUT_PIXEL(data_size) extract_input_stride2 +#elif STRIDE_X == 1 +#define INPUT_PIXEL(data_size) extract_input_stride1 +#else /* STRIDE_X not equals 1, 2 or 3 */ +#error "Only support strides 1, 2 and 3" +#endif /* STRIDE_X == 3 */ + +/** Extracts a 1D horizontal vector from the input tensor with stride as 1. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel) +{ + return vload8(0, input_pixel); +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 2. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + temp = vload16(0, input_pixel); + return temp.s02468ace; +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. 
+ */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel) +{ + VEC_DATA_TYPE(DATA_TYPE, 4) + temp1 = vload4(0, input_pixel); + VEC_DATA_TYPE(DATA_TYPE, 4) + temp2 = vload4(0, input_pixel + 6); + VEC_DATA_TYPE(DATA_TYPE, 4) + temp3 = vload4(0, input_pixel + 12); + VEC_DATA_TYPE(DATA_TYPE, 4) + temp4 = vload4(0, input_pixel + 18); + return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03); +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel) +{ + VEC_DATA_TYPE(DATA_TYPE, 8) + temp1 = vload8(0, input_pixel); + VEC_DATA_TYPE(DATA_TYPE, 8) + temp2 = vload8(0, input_pixel + 8); + VEC_DATA_TYPE(DATA_TYPE, 8) + temp3 = vload8(0, input_pixel + 16); + return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25); +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + temp1 = vload16(0, input_pixel); + VEC_DATA_TYPE(DATA_TYPE, 16) + temp2 = vload16(0, input_pixel + 12); + return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); +} + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 + * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. 
-DSTRIDE_X=1
 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
 * @note If biases are used then -DHAS_BIAS has to be passed at compile time
 *
 * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
 * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
 * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
 * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
 */
__kernel void direct_convolution1x1(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#ifdef HAS_BIAS
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    unsigned int weights_stride_w)
{
    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* defined(HAS_BIAS) */

    // Accumulate 8 horizontal output pixels in the promoted type.
    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
    values = 0;

    // The Z global id selects which output feature map (i.e. which kernel) to compute.
    const uint z_index = get_global_id(2);

    weights.ptr += z_index * weights_stride_w;

    // NOTE(review): 'volatile' on the loop counter appears to be a deliberate
    // workaround to keep the compiler from unrolling/reordering this loop -
    // confirm with the original authors before removing.
    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
    {
        // 1x1 kernel: one scalar weight per input channel multiplies the 8
        // input pixels extracted according to the compile-time stride.
        DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
        VEC_DATA_TYPE(DATA_TYPE, 8)
        input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
        values      = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
        src.ptr += src_stride_z;
        weights.ptr += weights_stride_z;
    }

#ifdef HAS_BIAS
    values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
#endif /* defined(HAS_BIAS) */

    vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
}
#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

#if defined(WEIGHTS_DEPTH)

// Multiply-accumulate one scalar weight against 4 horizontal output pixels using mad().
#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
    ({                                                 \
        acc.s0 = mad(src.s0, weight_value, acc.s0);    \
        acc.s1 = mad(src.s1, weight_value, acc.s1);    \
        acc.s2 = mad(src.s2, weight_value, acc.s2);    \
        acc.s3 = mad(src.s3, weight_value, acc.s3);    \
    })

/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
 *
 * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
 * @note In case biases are used, -DHAS_BIAS must be passed at compile time
 *
 * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
 * @param[in] biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in] weights_stride_w                      Stride of the weights tensor in the 4th dimension
 */
__kernel void direct_convolution1x1_f32_bifrost(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#ifdef HAS_BIAS
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    unsigned int weights_stride_w)
{
    // Get the kernel index (output feature map)
    const int kernel_index = get_global_id(2);

    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Each work-item computes a 4x4 output tile (4 rows of 4 pixels).
    float4 acc0 = 0.0f;
    float4 acc1 = 0.0f;
    float4 acc2 = 0.0f;
    float4 acc3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);

    for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
    {
        // Load the single 1x1 weight for this input channel
        float weight = *((__global float *)weights_addr);

        // Load values from rows 0-3 of the input tensor
        float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
        float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
        float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
        float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));

        // Accumulate
        CONVOLUTION1x1_BIFROST(acc0, src0, weight);
        CONVOLUTION1x1_BIFROST(acc1, src1, weight);
        CONVOLUTION1x1_BIFROST(acc2, src2, weight);
        CONVOLUTION1x1_BIFROST(acc3, src3, weight);

        src_addr += src_stride_z;
        weights_addr += weights_stride_z;
    }

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));

    // Broadcast the per-kernel bias across every lane of the 4x4 tile
    // (vector form for consistency with direct_convolution3x3_f32_bifrost).
    acc0 += (float4)bias;
    acc1 += (float4)bias;
    acc2 += (float4)bias;
    acc3 += (float4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
    vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
    vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
}
#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
new file mode 100644
index 0000000000..811df053c4
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
@@ -0,0 +1,291 @@
/*
 * Copyright (c) 2016-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#undef CONVERT_SAT + +#define ADD_OP(a, b) ((a) + (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define CONVERT_SAT(a, b) ((a)) + +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if STRIDE_X == 1 +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X == 2 */ + +#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + weights_values0 = vload3(0, weights_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + src0 = vload8(0, src_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + src1 = vload2(0, src_row_ptr + 8); \ + \ + acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ + }) + +#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + weights_values0 = vload3(0, weights_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 16) \ + src0 = vload16(0, src_row_ptr); \ + DATA_TYPE src1 = *(src_row_ptr + 16); \ + \ + acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ + }) + +/** This kernel performs a 
direct convolution to convolve the low three dimensions. + * + * @note This OpenCL kernel works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH + * @note If biases are used then -DHAS_BIAS has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr
 * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
 * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
 * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
 */
__kernel void direct_convolution3x3(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#ifdef HAS_BIAS
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    unsigned int weights_stride_w)
{
    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Accumulate 8 horizontal output pixels in the promoted type.
    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
    values0 = 0;

    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);

    // The Z global id selects which output feature map (i.e. which kernel) to compute.
    const int kernel_index = get_global_id(2);
    weights_addr += kernel_index * weights_stride_w;

    // NOTE(review): 'volatile' on the loop counter appears to be a deliberate
    // workaround to keep the compiler from unrolling/reordering this loop -
    // confirm with the original authors before removing.
    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
    {
        // Convolve the three kernel rows against the three matching input rows.
        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));

        src_addr += src_stride_z;
        weights_addr += weights_stride_z;
    }

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
#endif /* defined(HAS_BIAS) */

    vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
}
#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)

#if defined(WEIGHTS_DEPTH)

// 1x3 stride-1 row convolution over 4 horizontal output pixels using mad();
// src0 holds input elements 0..3 and src1 elements 4..5.
#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
    ({                                                        \
        acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0);       \
        acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1);       \
        acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2);       \
        acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3);       \
        acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0);       \
        acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1);       \
        acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2);       \
        acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3);       \
        acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0);       \
        acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1);       \
        acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2);       \
        acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3);       \
    })

/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
 *
 * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
 * @note In case biases are used, -DHAS_BIAS must be passed at compile time
 *
 *
@param[in] src_ptr Pointer to the source tensor. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
 * @param[in] biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
 * @param[in] weights_stride_w                      Stride of the weights tensor in the 4th dimension
 */
__kernel void direct_convolution3x3_f32_bifrost(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#ifdef HAS_BIAS
    VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
    unsigned int weights_stride_w)
{
    // Get the kernel index (output feature map)
    const int kernel_index = get_global_id(2);

    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Each work-item computes a 4x3 output tile: one float4 accumulator per output row.
    float4 values0 = 0;
    float4 values1 = 0;
    float4 values2 = 0;

    __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);

    // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor

    for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
    {
        // Load the weights
        float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
        float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
        float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
        float4 src0;
        float2 src1;

        // Load values from row0 of input tensor
        src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
        src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);

        CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);

        // Load values from row1 of input tensor
        src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
        src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);

        // Accumulate: row1 is the middle row for output row 0 and the top row for output row 1.
        CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
        CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);

        // Load values from row2 of input tensor
        src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
        src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);

        // Accumulate
        CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
        CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
        CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);

        // Load values from row3 of input tensor
        src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
        src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);

        // Accumulate
        CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
        CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);

        // Load values from row4 of input tensor
        src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
        src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);

        // Accumulate
        CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);

        src_addr += src_stride_z;
        weights_addr += weights_stride_z;
    }

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));

    // Broadcast the per-kernel bias across every lane of the 4x3 tile
    values0 += (float4)bias;
    values1 += (float4)bias;
    values2 += (float4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
    vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
}
#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
new file mode 100644
index 0000000000..59d668f0bf
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
@@ -0,0 +1,313 @@
/*
 * Copyright (c) 2016-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
+ */ +#include "helpers.h" + +#undef CONVERT_SAT + +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if STRIDE_X == 1 +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X == 2 */ + +#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + weights_values0 = vload4(0, weights_row_ptr); \ + DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + src0 = vload8(0, src_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + src1 = vload4(0, src_row_ptr + 8); \ + \ + acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ + }) + +#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + weights_values0 = vload4(0, weights_row_ptr); \ + DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 16) \ + src0 = vload16(0, src_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + src1 = vload4(0, src_row_ptr + 16); \ + acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; 
\ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ + \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ + }) + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH + * @note If biases are used then -DHAS_BIAS has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution5x5( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + VEC_DATA_TYPE(DATA_TYPE, 8) + values0 = 0; + + __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); + __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); + + const int kernel_index = get_global_id(2); + weights_addr += kernel_index * weights_stride_w; + + for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) + { + CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); + + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + + values0 += 
(VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))); +#endif /* defined(HAS_BIAS) */ + + vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr); +} +#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if defined(WEIGHTS_DEPTH) + +#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \ + ({ \ + acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \ + acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \ + acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \ + acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \ + acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \ + acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \ + acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \ + acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \ + acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \ + acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \ + acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \ + acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \ + acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \ + acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \ + acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \ + acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \ + acc.s0 = mad(src0.s4, weights_row01, acc.s0); \ + acc.s1 = mad(src0.s5, weights_row01, acc.s1); \ + acc.s2 = mad(src0.s6, weights_row01, acc.s2); \ + acc.s3 = mad(src0.s7, weights_row01, acc.s3); \ + }) + +/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32 + * + * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 + * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH + * @note If biases are used then -DHAS_BIAS has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution5x5_f32_bifrost( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + // Get the kernel index + const int kernel_index = get_global_id(2); + + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float4 values0 = 0.0f; + float4 values1 = 0.0f; + + __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); + __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); + + // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor + + for(ushort d = 
0; d < (ushort)WEIGHTS_DEPTH; ++d) + { + // Load the weights from row0 and row1 + float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y)); + float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4); + float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y)); + float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4); + float8 src0; + + // Load values from row0 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y)); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); + + // Load values from row1 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y)); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); + + // Load values from row2 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y)); + + // Load weights from row2 + weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y)); + weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); + + // Load values from row3 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y)); + + // Load weights from row3 + weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y)); + weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); + + // Load values from row4 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y)); + + // 
Load weights from row4 + weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y)); + weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4); + + CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); + + // Load values from row5 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y)); + + // Accumulate + CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); + + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + + float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index))); + + values0 += bias; + values1 += bias; +#endif /* defined(HAS_BIAS) */ + + vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); + vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); +} +#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl new file mode 100644 index 0000000000..b80d4f587e --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers_asymm.h" + +#undef CONVERT_SAT_STR +#undef CONVERT_SAT + +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) + +#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x))) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +#if KERNEL_SIZE == 9 + +#if STRIDE_X == 1 +#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 +#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X */ + +#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + 
WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \ + acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#elif KERNEL_SIZE == 5 + +#if STRIDE_X == 1 +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) 
CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X */ + +#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ + int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ + int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \ + acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \ + acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#elif KERNEL_SIZE == 3 + +#if STRIDE_X == 1 +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) 
CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X */ + +#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ + int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ + int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \ + acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + }) + +#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + int src1 = convert_int(*(src_row_ptr + 16)); \ + acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + }) + +#elif KERNEL_SIZE == 1 + +#if STRIDE_X == 3 +#define INPUT_VALUE extract_input_stride3 +#elif STRIDE_X == 2 +#define INPUT_VALUE extract_input_stride2 +#elif STRIDE_X == 1 +#define INPUT_VALUE extract_input_stride1 + +#else /* STRIDE_X not equals 1, 2 or 3 */ +#error "Only support strides 1, 2 and 3" +#endif /* STRIDE_X */ + +/** Extracts a 1D horizontal vector from the input tensor with stride as 1. + * + * @param[in] input_value Pointer to the first value. + * + * @return extracted input values. 
+ */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value) +{ + return vload8(0, input_value); +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 2. + * + * @param[in] input_value Pointer to the first value. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + temp = vload16(0, input_value); + return temp.s02468ace; +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. + * + * @param[in] input_value Pointer to the first value. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + temp1 = vload16(0, input_value); + VEC_DATA_TYPE(DATA_TYPE, 16) + temp2 = vload16(0, input_value + 12); + return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); +} + +#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */ +#error "Only kernel sizes 1, 3, 5 and 9 are supported" +#endif /* KERNEL_SIZE */ + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 + * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH + * @note If biases are used then -DHAS_BIAS has to be passed at compile time + * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 + * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 + * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. 
-DINPUT_OFFSET=3 + * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3 + * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32 + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution_quantized( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + int8 values0 = 0; + + __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0); + __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0); + + const int kernel_index = get_global_id(2); + weights_addr += kernel_index * weights_stride_w; + + for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) + { +#if KERNEL_SIZE == 9 + 
CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y)); + CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y)); +#elif KERNEL_SIZE == 5 + CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); + CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); +#elif KERNEL_SIZE == 3 + CONVOLUTION1x3(values0, 
(__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); + CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); + CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); +#elif KERNEL_SIZE == 1 + int weight = convert_int(*(__global DATA_TYPE *)weights_addr); + int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr)); + values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET); +#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */ + + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index))); + values0 += (int8)(*bias_addr); +#endif /* defined(HAS_BIAS) */ + +#if OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_SHIFT < 0 + values0 = values0 + OUTPUT_OFFSET; + + vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr); +} +#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) diff --git a/src/core/CL/cl_kernels/nchw/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl new file mode 100644 index 0000000000..fddf918c63 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/im2col.cl @@ -0,0 +1,863 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#if defined(DATA_TYPE) && defined(ELEMENT_SIZE) + +#if ELEMENT_SIZE == 1 +#define COND_DATA_TYPE char +#elif ELEMENT_SIZE == 2 +#define COND_DATA_TYPE short +#elif ELEMENT_SIZE == 4 +#define COND_DATA_TYPE int +#else // ELEMENT_SIZE +#error "Element size not support" +#endif // ELEMENT_SIZE + +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) +/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. 
-DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
/** im2col for a 1x1 kernel with stride_x == 1 (NCHW layout).
 *
 * Each work-item handles 4 consecutive x positions of the convolved tensor:
 * it loads 4 contiguous input elements and scatters them into 4 rows of the
 * im2col output matrix.
 *
 * Compile-time requirements: -DDATA_TYPE, -DSRC_DEPTH, -DSRC_WIDTH,
 * -DSTRIDE_Y, -DCONVOLVED_WIDTH; optional -DNUM_GROUPS and -DHAS_BIAS.
 */
__kernel void im2col1x1_stridex1_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const uint xc    = get_global_id(0) * 4;         // x coordinate in the convolved tensor (4 elements per work-item)
    const uint yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const uint ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const uint batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Clamp xc
    // The strategy clamps at "xc" as it will be a valid value for sure
    uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);

    // Check which of the 4 x positions fall inside the input width
    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

    // Out-of-bound lanes are redirected to "xc" (always valid); together with
    // the matching select on "data" below, those lanes simply rewrite lane 0's
    // destination row with lane 0's value, which is harmless.
    xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));

    // Calculate input indices
    const uint xi = xc;
    const uint yi = yc * STRIDE_Y;

    // Calculate output indices

#if defined(NUM_GROUPS)
    const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
    const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const uint xo = ch;
#endif // defined(NUM_GROUPS)
    const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    VEC_DATA_TYPE(DATA_TYPE, 4)
    data = vload4(0, (__global DATA_TYPE *)input_ptr);

    // If out-of-bound, overwrite with the first element
    data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);

    // Scatter the 4 loaded values into 4 consecutive output rows
    *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
    *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
    *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
    *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;

#ifdef HAS_BIAS
    // The last input channel appends the bias lane (a constant 1) to each row
#if defined(NUM_GROUPS)
    if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) +#if defined(DILATION_X) && defined(DILATION_Y) +/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
/** Generic im2col for NCHW: each work-item linearizes one
 *  KERNEL_WIDTH x KERNEL_HEIGHT receptive field (with dilation) of one input
 *  channel into a contiguous segment of one output-matrix row.
 *
 * Compile-time requirements: -DDATA_TYPE, -DSRC_WIDTH, -DSRC_HEIGHT,
 * -DSRC_DEPTH, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DCONVOLVED_WIDTH,
 * -DSTRIDE_X/-DSTRIDE_Y, -DPAD_LEFT/-DPAD_RIGHT/-DPAD_TOP/-DPAD_BOTTOM,
 * -DPAD_VALUE, -DDILATION_X/-DDILATION_Y; optional -DNUM_GROUPS, -DHAS_BIAS.
 */
__kernel void im2col_generic_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);              // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);              // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH;  // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH;  // batch index

    // Top-left corner of the receptive field in the input tensor
    const int start_x = xc * STRIDE_X - PAD_LEFT;
    const int start_y = yc * STRIDE_Y - PAD_TOP;

    // Destination coordinates
#if defined(NUM_GROUPS)
    const int dst_x = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
    const int dst_z = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int dst_x = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
#endif // defined(NUM_GROUPS)
    const int dst_y = xc + yc * CONVOLVED_WIDTH; // index of the convolution

    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global DATA_TYPE *out = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + dst_y * dst_stride_y + dst_z * dst_stride_z + batch * dst_stride_w)) + dst_x;
#else  // defined(NUM_GROUPS)
    __global DATA_TYPE *out = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + dst_y * dst_stride_y + batch * dst_stride_w)) + dst_x;
#endif // defined(NUM_GROUPS)

    // Linearize the receptive field row-major into the destination segment
    int write_idx = 0;
    for(int ky = 0; ky < KERNEL_HEIGHT; ++ky)
    {
        const int in_y = start_y + ky * DILATION_Y;
        for(int kx = 0; kx < KERNEL_WIDTH; ++kx)
        {
            const int in_x = start_x + kx * DILATION_X;
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            // No padding: every tap is guaranteed to be inside the input
            out[write_idx++] = *((__global DATA_TYPE *)(input_ptr + in_x * src_stride_x + in_y * src_stride_y));
#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            // Taps falling outside the input are replaced by PAD_VALUE
            const bool inside = (in_x >= 0) && (in_x < SRC_WIDTH) && (in_y >= 0) && (in_y < SRC_HEIGHT);
            out[write_idx++] = inside ? *((__global DATA_TYPE *)(input_ptr + in_x * src_stride_x + in_y * src_stride_y)) : (DATA_TYPE)PAD_VALUE;
#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
        }
    }

#ifdef HAS_BIAS
    // The last input channel appends the bias lane (a constant 1) to the row
#if defined(NUM_GROUPS)
    if((dst_x / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        out[write_idx] = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(DILATION_X) && defined(DILATION_Y)
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
/** im2col for a 3x3 kernel (NCHW layout).
 *
 * Each work-item loads the 3x3 receptive field of one input channel as three
 * vload3 rows, masks out-of-bound taps with PAD_VALUE, and stores the 9
 * elements contiguously into one output-matrix row.
 */
__kernel void im2col3x3_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices (top-left of the receptive field; may be negative with padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * 9; // 3x3
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    // Note: xi/yi can be negative, hence the (int) casts on the strides
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Load the three 3-element rows of the receptive field
    VEC_DATA_TYPE(DATA_TYPE, 3)
    row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
    // Put 0 if the value is out-of-bound

    int3 x = (int3)xi + (int3)(0, 1, 2);
    int3 y = (int3)yi + (int3)(0, 1, 2);

    // Per-row masks: the x bounds check is vectorized over the 3 columns,
    // the scalar y check for that row is broadcast to all 3 lanes
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

    // Replace out-of-bound taps with the padding value
    row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
    row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
    row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

    // Store the 9 elements: 8 vectorized, the 9th as a scalar
    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
    *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

#ifdef HAS_BIAS
    // The last input channel appends the bias lane (a constant 1)
#if defined(NUM_GROUPS)
    if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
    }
#endif // HAS_BIAS
}
-DSRC_DEPTH=3 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
/** im2col for a 5x5 kernel (NCHW layout).
 *
 * Each work-item linearizes the 5x5 receptive field of one input channel into
 * 25 contiguous output elements. Rows are processed two at a time (each as a
 * vload4 + scalar), with the boundary masks precomputed once up front.
 */
__kernel void im2col5x5_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices (top-left of the receptive field; may be negative with padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * 25; // 5x5
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
    // Put 0 if the value is out-of-bound
    // Columns/rows 0..3 are handled vectorized (x0/y0); column/row 4 is scalar (x1/y1)
    int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
    int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
    int x1 = xi + 4;
    int y1 = yi + 4;

    // Check if we could have out-of-bounds elements in the x direction
    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
    x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
    y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
    COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
    COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

    // Get input and output address
    // Note: xi/yi can be negative, hence the (int) casts on the strides
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Rows 0 and 1 of the receptive field (10 output elements)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row11 = *((__global DATA_TYPE *)input_ptr + 4);

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        // Combine the x masks with the y mask of each row (broadcast to 4 lanes)
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
                                              row10.s012),
                0, (__global DATA_TYPE *)output_ptr);
        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 10 * dst_stride_x;
    }

    // Rows 2 and 3 of the receptive field (10 output elements)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row11 = *((__global DATA_TYPE *)input_ptr + 4);

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
                                              row10.s012),
                0, (__global DATA_TYPE *)output_ptr);
        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 10 * dst_stride_x;
    }

    // Row 4 (last row) of the receptive field (5 output elements)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
        *((__global DATA_TYPE *)output_ptr + 4) = row01;

        output_ptr += 5 * dst_stride_x;
    }

#ifdef HAS_BIAS
    // The last input channel appends the bias lane (a constant 1)
#if defined(NUM_GROUPS)
    if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)output_ptr) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-DCONVOLVED_WIDTH=34 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
/** im2col for an 11x11 kernel with no padding (NCHW layout).
 *
 * Each work-item copies the 11x11 receptive field of one input channel into
 * 121 contiguous output elements, one 11-element row at a time (vstore8 +
 * vstore3).
 *
 * Fixes relative to the original:
 * - The destination pointer is now advanced with dst_stride_x instead of
 *   src_stride_x. The original only worked because src and dst share the
 *   same element size, so the two strides happened to coincide; the 3x3 and
 *   5x5 kernels in this file already use dst_stride_x.
 * - The 11 byte-identical copy-pasted row blocks are collapsed into an
 *   unrolled loop with identical memory accesses (the extra input_ptr
 *   advance on the last iteration is dead: no load follows it).
 */
__kernel void im2col11x11_padx0_pady0_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices (no padding, so always in-bounds)
    const int xi = xc * STRIDE_X;
    const int yi = yc * STRIDE_Y;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * 121; // 11x11
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Copy the receptive field row by row: 8 + 3 elements per row
#pragma unroll
    for(int row = 0; row < 11; ++row)
    {
        VEC_DATA_TYPE(DATA_TYPE, 8)
        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
        VEC_DATA_TYPE(DATA_TYPE, 3)
        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

        vstore8(row00, 0, (__global DATA_TYPE *)output_ptr);
        vstore3(row01, 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 11 * dst_stride_x;
    }

#ifdef HAS_BIAS
    // The last input channel appends the bias lane (a constant 1)
#if defined(NUM_GROUPS)
    if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)output_ptr) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
greater than 1x1, we do not have paddings and the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. + * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4. + * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col_generic_padx0_pady0_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X; + const int yi = yc * STRIDE_Y; + + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + 
batch * src_stride_w; +#if defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; +#else // defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; +#endif // defined(NUM_GROUPS) + + // Linearize convolution elements + for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) + { + int last_x = 0; + for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); + VSTORE(VECTOR_SIZE) + (row, 0, output_ptr); + last_x = x; + } + // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE). + // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit. 
+#if WIDTH_MOD_VECTOR_SIZE == 1 + *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); +#elif WIDTH_MOD_VECTOR_SIZE > 1 + VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE) + row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); + VSTORE(WIDTH_MOD_VECTOR_SIZE) + (row, 0, output_ptr); +#endif /* WIDTH_MOD_VECTOR_SIZE */ + output_ptr += WIDTH_MOD_VECTOR_SIZE; + } /* End of loop over KERNEL_HEIGHT */ + +#ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) + if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) + { + *output_ptr = 1.0f; + } +#endif // HAS_BIAS +} +#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl new file mode 100644 index 0000000000..0fef98e295 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "tile_helpers.h" + +#define MUL_OP(x, y) ((x) * (y)) +#define ADD_OP(x, y) ((x) + (y)) +#define DIV_OP(x, y) ((x) / (y)) +#define POW_OP(x, y) pow((x), (y)) +#define SQCVT_SAT(a) (a) + +#if defined(NUM_SLICES) +/** Apply cross-map normalization. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. 
-DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA); + + const int current_slice = get_global_id(2); + const int left_slice = max(-(int)RADIUS, -current_slice); + const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice); + + for(int i = left_slice; i <= right_slice; i++) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i)); + acc = ADD_OP(acc, MUL_OP(values, values)); + } + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel = 
DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); + + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); +} +#endif /* defined(NUM_SLICES) */ + +#if defined(WIDTH_SIZE) +/** Apply in-map normalization when tensors are in the NCHW data layout format. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int current_col = get_global_id(0) << 2; + const int left_pos = max(-(int)RADIUS, -3 - current_col); + const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col); + +#if defined(IN_MAP_2D) + const int current_row = get_global_id(1); + const int first_row = max(-(int)RADIUS, -current_row); + const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row); +#endif /* defined(IN_MAP_2D) */ + +#if defined(IN_MAP_2D) + for(int j = first_row; j <= last_row; ++j) + { +#endif /* defined(IN_MAP_2D) */ + for(int i = left_pos; i <= right_pos; ++i) + { +#if defined(IN_MAP_2D) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0)); +#else /* 
defined(IN_MAP_2D) */ + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0)); +#endif /* defined(IN_MAP_2D) */ + acc = ADD_OP(acc, MUL_OP(values, values)); + } +#if defined(IN_MAP_2D) + } +#endif /* defined(IN_MAP_2D) */ + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); + + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(WIDTH_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl new file mode 100644 index 0000000000..23a0de76f7 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. 
-DVEC_SIZE=8 + * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor + */ +__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector std = CONVERT_TO_VECTOR_STRUCT(std); + + const uint current_slice = get_global_id(2) % NUM_CHANNELS; + + const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))); + const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))); + + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + TYPE res = (data - curr_mean) / curr_std; + + VSTORE(VEC_SIZE) + (res, 0, (__global DATA_TYPE *)dst.ptr); +} +#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl new file mode 100644 index 0000000000..0f02ef6184 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define OFFSET_FLT ((float)OFFSET) +#define SCALE_FLT ((float)SCALE) + +#if defined(NUM_CHANNELS) + +/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 + * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 + * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. 
Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor + */ +__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector std = CONVERT_TO_VECTOR_STRUCT(std); + + const uint current_slice = get_global_id(2) % NUM_CHANNELS; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)))); + curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)))); + curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE)); + data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT; + + // Perform normalization + VEC_DATA_TYPE(float, VEC_SIZE) + res_flt = (data_flt - curr_mean_flt) / curr_std_flt; + + const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); + VSTORE(VEC_SIZE) + (res_u8, 0, (__global DATA_TYPE *)dst.ptr); +} + +#endif // defined(NUM_CHANNELS) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl new file mode 100644 index 0000000000..790ddb381a --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(FP_MIXED_PRECISION) +#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ + CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) +#else /* defined(FP_MIXED_PRECISION) */ +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) +#endif /* defined(FP_MIXED_PRECISION) */ + +ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +/** Performs a pooling function of pool size equal to N (NCHW) + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. 
-DPOOL_SIZE_X=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_MxN_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) + vdata = INITIAL_VALUE; + ACC_DATA_TYPE sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE_Y; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) + { + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) + data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE_X; ++x) + { + ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) + reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + 
ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average pooling + res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; +} +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) +{ + const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; + const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM; + + const int x = get_global_id(0) * STRIDE_X; + const int y = get_global_id(1) * STRIDE_Y; + const int z = get_global_id(2); + + //x axis: width, y axis: height, z axis: component + const uint padded_offset = input->offset_first_element_in_bytes + + x * input->stride_x + + y * input->stride_y + + z * input->stride_z; + + const uint offset_base = padded_offset + - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ + - PAD_TENSOR_TOP * input->stride_y /* top padding */ + - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */ + - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); + +#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) + *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT)); +#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ + *offset_top = (uint)(offset_base / sizeof(DATA_TYPE)); +#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && 
defined(TENSOR_HEIGHT) */ + + *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + + return; +} + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp32( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); + float2 data1 = VLOAD(2)(0, 
(__global float *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + float data0_max = POOL_OP(data0.s0, data0.s1); + float data1_max = POOL_OP(data1.s0, data1.s1); + float res = POOL_OP(data0_max, data1_max); + // Store result + *(__global float *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +} + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. 
Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp16( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); + half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + half data0_max = POOL_OP(data0.s0, data0.s1); + half data1_max = POOL_OP(data1.s0, data1.s1); + half res = POOL_OP(data0_max, data1_max); + // Store result + *(__global half *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = 
index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl new file mode 100644 index 0000000000..1440ef3ed1 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#if defined(POOL_AVG) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) */ +#define POOL_OP(x, y) (max((x), (y))) +#endif /* defined(POOL_AVG) */ + +#define DIV_OP(x, y) (x * (1.f / y)) + +#if defined(POOL_L2) +#error "L2 pooling is not supported" +#endif /* defined(POOL_L2) */ + +int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +/** Performs a pooling function of pool size equal to N (NCHW) + * + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pooling_layer_MxN_quantized_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + int8 vdata = INITIAL_VALUE; + int sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE_Y; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) + { + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE 
*)tensor3D_offset(&input, x, y, 0)); + int8 data0 = convert_int8(data); + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int data0 = convert_int(data); + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + int res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + +#if defined(POOL_AVG) + res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); +#endif /* defined(POOL_AVG) */ + + DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + + const float result_f32 = convert_float(result_q8); + const float input_offset = (float)OFFSET_IN1; + const float input_scale = (float)SCALE_IN1; + const float scale_out = (float)SCALE_OUT; + const float offset_out = (float)OFFSET_OUT; + const float in_f32 = (result_f32 - input_offset) * input_scale; + const float out_f32 = in_f32 / scale_out + offset_out; + result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE); + +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + + *(__global DATA_TYPE *)output.ptr = result_q8; +} +#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl new file mode 100644 index 0000000000..7524ba7b4a --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) + +/** Compute prior boxes and clip (NCHW) + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] idx Index to write to + * @param[in] center_x Center value of the x axis + * @param[in] center_y Center value of the y axis + * @param[in] box_width Prior box width + * @param[in] box_height Prior box height + * + */ +inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height) +{ + float xmin = (center_x - box_width / 2.f) / WIDTH; + float ymin = (center_y - box_height / 2.f) / HEIGHT; + float xmax = (center_x + box_width / 2.f) / WIDTH; + float ymax = (center_y + box_height / 2.f) / HEIGHT; + +#if defined(CLIP) + xmin = clamp(xmin, 0.f, 1.f); + ymin = clamp(ymin, 0.f, 1.f); + xmax = clamp(xmax, 0.f, 1.f); + ymax = clamp(ymax, 0.f, 1.f); +#endif // defined(CLIP) + + // Store result + vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0))); +} + +/** Compute prior boxes (NCHW) + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] min_size Prior box min size + * @param[in] min_idx Index of the min vector + * @param[in] idx Index to write to + * + * @return The updated index + */ +inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx) +{ + const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X; + const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y; + + float box_width = min_size; + float box_height = min_size; + calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); + idx += 4; + + if(max_size > 0) + { + box_width = sqrt(min_size * max[min_idx]); + box_height = box_width; + calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); + idx += 4; + } + for(unsigned int i = 0; i < aspect_ratios_size; ++i) + { + if(fabs(aspect_ratios[i] - 1.f) < 1e-6f) + { + continue; + } + box_width = min_size * sqrt(aspect_ratios[i]); + box_height = min_size * rsqrt(aspect_ratios[i]); + + calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); + idx += 4; + } + + return idx; +} +/** Calculate prior boxes with NCHW format. + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] min The minimum values + * @param[in] max The maximum_values + * @param[in] aspect_ratios The aspect ratio values + * @param[in] min_size The minimum values size + * @param[in] max_size The maximum_values values size + * @param[in] aspect_ratios_size The aspect ratio values size + */ +__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size, + unsigned int aspect_ratios_size) +{ + Image out = CONVERT_TO_IMAGE_STRUCT(output); + + int idx = 0; + for(unsigned int i = 0; i < min_size; ++i) + { + idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx); + } + + // Store variances + for(int i = 0; i < (NUM_PRIORS * 4); i += 4) + { + vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1))); + } +} +#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */ diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl new file mode 100644 index 0000000000..fab88a1682 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/remap.cl @@ -0,0 +1,133 @@ +/* + * 
Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "warp_helpers.h"

#ifndef DEPTH_OUT
/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
 *
 * This kernel performs remapping with this method of pixel coordinate translation:
 *     out(x,y) = in(mapx(x,y), mapy(x,y));
 *
 * @note Each work item processes 4 consecutive output pixels (vload4/vstore4).
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
 * @param[in]  mapx_ptr                          Pointer to the x remapping image. Supported data types: F32.
 * @param[in]  mapx_stride_x                     Stride of the remapping image in X dimension (in bytes)
 * @param[in]  mapx_step_x                       mapx_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapx_stride_y                     Stride of the remapping image in Y dimension (in bytes)
 * @param[in]  mapx_step_y                       mapx_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
 * @param[in]  mapy_ptr                          Pointer to the y remapping image. Supported data types: F32.
 * @param[in]  mapy_stride_x                     Stride of the remapping image in X dimension (in bytes)
 * @param[in]  mapy_step_x                       mapy_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapy_stride_y                     Stride of the remapping image in Y dimension (in bytes)
 * @param[in]  mapy_step_y                       mapy_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
 * @param[in]  width                             Width of the input image
 * @param[in]  height                            Height of the input image
 */
__kernel void remap_nearest_neighbour_nchw(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    IMAGE_DECLARATION(mapx),
    IMAGE_DECLARATION(mapy),
    const float width,
    const float height)
{
    // The input image is addressed with absolute coordinates taken from the maps,
    // so its per-work-item step must not be applied (NO_STEP).
    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);

    // Load 4 x-coordinates and 4 y-coordinates, then interleave them into
    // 4 (x, y) pairs as expected by read_texels4.
    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);

    // Clamp out-of-range coordinates to the border before sampling.
    vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
}

/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
 *
 * This kernel performs remapping with this method of pixel coordinate translation:
 *     out(x,y) = in(mapx(x,y), mapy(x,y));
 *
 * @note Each work item processes 4 consecutive output pixels (vload4/vstore4).
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8.
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
 * @param[in]  mapx_ptr                          Pointer to the x remapping image. Supported data types: F32.
 * @param[in]  mapx_stride_x                     Stride of the remapping image in X dimension (in bytes)
 * @param[in]  mapx_step_x                       mapx_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapx_stride_y                     Stride of the remapping image in Y dimension (in bytes)
 * @param[in]  mapx_step_y                       mapx_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
 * @param[in]  mapy_ptr                          Pointer to the y remapping image. Supported data types: F32.
 * @param[in]  mapy_stride_x                     Stride of the remapping image in X dimension (in bytes)
 * @param[in]  mapy_step_x                       mapy_stride_x * number of elements along X processed per work item (in bytes)
 * @param[in]  mapy_stride_y                     Stride of the remapping image in Y dimension (in bytes)
 * @param[in]  mapy_step_y                       mapy_stride_y * number of elements along Y processed per work item (in bytes)
 * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
 * @param[in]  width                             Width of the input image
 * @param[in]  height                            Height of the input image
 */
__kernel void remap_bilinear_nchw(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    IMAGE_DECLARATION(mapx),
    IMAGE_DECLARATION(mapy),
    const float width,
    const float height)
{
    // Absolute addressing into the input image: do not apply the work-item step.
    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);

    // Interleave 4 x-coordinates and 4 y-coordinates into 4 (x, y) pairs.
    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);

    vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
}
#endif // DEPTH_OUT
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
new file mode 100644
index 0000000000..f66b17c1a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
@@ -0,0 +1,75 @@
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)

// Maps an output coordinate (xo, yo, zo) to its source coordinate (xi, yi, zi).
// The output channel index zo encodes both the source channel (zo % SRC_DEPTH)
// and the position inside the STRIDE x STRIDE sub-block (zo / SRC_DEPTH).
#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi)     \
    ({                                                        \
        int offset = zo / (int)SRC_DEPTH;                     \
        xi         = xo * (int)STRIDE + offset % (int)STRIDE; \
        yi         = yo * (int)STRIDE + offset / (int)STRIDE; \
        zi         = zo % SRC_DEPTH;                          \
    })

/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
 * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reorg_layer_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // One work item per output element: global id is the output coordinate.
    int xo = get_global_id(0);
    int yo = get_global_id(1);
    int zo = get_global_id(2);
    int xi, yi, zi;

    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);

    // Source is addressed manually (byte offset), bypassing the Tensor3D step logic.
    int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
}
#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
new file mode 100644
index 0000000000..63a53cc4f2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -0,0 +1,148 @@
/*
 * Copyright (c) 2016-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "warp_helpers.h"

/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
 *
 * Produces 4 consecutive x positions (for the 4 output pixels handled per work item)
 * paired with a single y position, scaled into input-image space.
 *
 * @param[in] coord 2D coordinates to transform.
 * @param[in] scale input/output scale ratio
 *
 * @return a float8 containing 4 2D transformed values in the input image.
 */
inline const float8 transform_nearest(const float2 coord, const float2 scale)
{
#ifdef SAMPLING_POLICY_TOP_LEFT
    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
    const float4 new_x       = in_x_coords * (float4)(scale.s0);
    const float4 new_y       = (float4)(coord.s1 * scale.s1);
    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
#elif SAMPLING_POLICY_CENTER
    // CENTER policy samples from the pixel centre: offset by +0.5 before scaling.
    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
    const float4 new_x       = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
    const float4 new_y       = (float4)((coord.s1 + 0.5f) * scale.s1);
    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
}

/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
 *
 * Same as transform_nearest but, under the CENTER policy, also subtracts 0.5 after
 * scaling so the bilinear taps are centred on the source pixel grid.
 *
 * @param[in] coord 2D coordinates to transform.
 * @param[in] scale input/output scale ratio
 *
 * @return a float8 containing 4 2D transformed values in the input image.
 */
inline const float8 transform_bilinear(const float2 coord, const float2 scale)
{
    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
#ifdef SAMPLING_POLICY_TOP_LEFT
    const float4 new_x = in_x_coords * (float4)(scale.s0);
    const float4 new_y = (float4)(coord.s1 * scale.s1);
    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
#elif SAMPLING_POLICY_CENTER
    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
}

/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
 *
 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
 * @note The border size must be passed at compile time using -DBORDER_SIZE e.g. -DBORDER_SIZE=1
 * @note If -DALIGN_CORNERS is passed, the transformed coordinates are rounded before sampling.
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  scale_x                           The scale factor along x dimension
 * @param[in]  scale_y                           The scale factor along y dimension
 */
__kernel void scale_nearest_neighbour_nchw(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float scale_x,
    const float scale_y)
{
    // Input is addressed with absolute transformed coordinates: no work-item step.
    Image        in          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image        out         = CONVERT_TO_IMAGE_STRUCT(out);
    const float2 r           = (float2)(scale_x, scale_y);
    float8       transformed = transform_nearest(get_current_coords(), r);
#ifdef ALIGN_CORNERS
    transformed = round(transformed);
#endif // ALIGN_CORNERS
    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
    vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
}

/** Performs an affine transformation on an image interpolating with the BILINEAR method.
 *
 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
 * @note The border size must be passed at compile time using -DBORDER_SIZE e.g. -DBORDER_SIZE=1
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  scale_x                           The scale factor along x dimension
 * @param[in]  scale_y                           The scale factor along y dimension
 */
__kernel void scale_bilinear_nchw(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float scale_x,
    const float scale_y)
{
    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
    const float2 r   = (float2)(scale_x, scale_y);
    const float8 tc  = transform_bilinear(get_current_coords(), r);
    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
new file mode 100644
index 0000000000..946ad65c14
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
@@ -0,0 +1,86 @@
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"
#include "warp_helpers_quantized.h"

/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
 *
 * Produces 4 consecutive x positions (for the 4 output pixels handled per work item)
 * paired with a single y position, scaled into input-image space.
 *
 * @param[in] coord 2D coordinates to transform.
 * @param[in] scale input/output scale ratio
 *
 * @return a float8 containing 4 2D transformed values in the input image.
 */
inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
{
    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
#ifdef SAMPLING_POLICY_TOP_LEFT
    const float4 new_x = in_x_coords * (float4)(scale.s0);
    const float4 new_y = (float4)(coord.s1 * scale.s1);
    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
#elif SAMPLING_POLICY_CENTER
    // CENTER policy samples from the pixel centre: offset by +0.5 before scaling,
    // then subtract 0.5 so the bilinear taps are centred on the source grid.
    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
}

/** Performs an affine transformation on an image interpolating with the BILINEAR method.
 *
 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
 * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
 *
 * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
 * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
 * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
 * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
 * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
 * @param[in]  input_width                       Input image width
 * @param[in]  input_height                      Input image height
 * @param[in]  scale_x                           The scale factor along x dimension
 * @param[in]  scale_y                           The scale factor along y dimension
 */
__kernel void scale_bilinear_quantized_nchw(
    IMAGE_DECLARATION(in),
    IMAGE_DECLARATION(out),
    const float input_width,
    const float input_height,
    const float scale_x,
    const float scale_y)
{
    // Input is addressed with absolute transformed coordinates: no work-item step.
    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
    const float2 r   = (float2)(scale_x, scale_y);
    const float8 tc  = transform_bilinear_quantized(get_current_coords_quantized(), r);
    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
new file mode 100644
index 0000000000..e162a29bb0
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -0,0 +1,156 @@
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
/** Calculate the space to batch conversion.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
 *
 * NOTE(review): the kernel body uses BATCH_IN, but only BATCH_SIZE appears in the
 * guarding #if above — confirm the host code always defines BATCH_IN as well.
 *
 * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: All
 * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
 * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
 * @param[in]  paddings_stride_x                         Stride of the paddings tensor in X dimension (in bytes)
 * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  paddings_stride_y                         Stride of the paddings tensor in Y dimension (in bytes)
 * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  paddings_offset_first_element_in_bytes    The offset of the first element in the second source image
 * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
 * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
 * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
 * @param[in]  batch_id                                  The output tensor batch id
 * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
 */
__kernel void space_to_batch_nchw(
    TENSOR4D_DECLARATION(input),
    IMAGE_DECLARATION(paddings),
    VECTOR_DECLARATION(block_shape),
    const int batch_id,
    TENSOR3D_DECLARATION(output))
{
    // Input is addressed with absolute coordinates computed below: no work-item step.
    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Paddings tensor layout: row 0 = x (left, right), row 1 = y (left, right).
    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));

    int block_x = *((__global int *)vector_offset(&block, 0));
    int block_y = *((__global int *)vector_offset(&block, 1));

    const int out_x = get_global_id(0);
    const int out_y = get_global_id(1);
    const int z     = get_global_id(2);

    // Position of this output element inside the (padded) input spatial plane.
    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);

    // Only copy elements that fall inside the un-padded region of the input;
    // elements in the padded border are left untouched in the output.
    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
    {
        const int w    = batch_id % BATCH_IN;
        const int in_x = pos_x - pad_left_x;
        const int in_y = pos_y - pad_left_y;

        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
    }
}

#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)

#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
/** Calculate the space to batch conversion.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
 * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
 * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
 * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
 * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
 * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
 * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
 *
 * NOTE(review): the kernel body uses BATCH_IN, but only BATCH_SIZE appears in the
 * guarding #if above — confirm the host code always defines BATCH_IN as well.
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
 * @param[in]  batch_id                             The output tensor batch id
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void space_to_batch_static_nchw(
    TENSOR4D_DECLARATION(input),
    const int batch_id,
    TENSOR3D_DECLARATION(output))
{
    // Input is addressed with absolute coordinates computed below: no work-item step.
    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Same algorithm as space_to_batch_nchw, but paddings and block shape are
    // compile-time constants instead of runtime tensors.
    int block_x = BLOCK_SHAPE_X;
    int block_y = BLOCK_SHAPE_Y;

    const int out_x = get_global_id(0);
    const int out_y = get_global_id(1);
    const int z     = get_global_id(2);

    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);

    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
    {
        const int w    = batch_id % BATCH_IN;
        const int in_x = pos_x - PAD_LEFT_X;
        const int in_y = pos_y - PAD_LEFT_Y;

        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
    }
}
#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..aea02e813b
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
/*
 * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#include "helpers.h"

#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
/** Space to depth transformation. (NCHW)
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
 * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
 * @param[in]  batch_id                             The input tensor batch id
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void space_to_depth_nchw(
    TENSOR4D_DECLARATION(input),
    const int batch_id,
    TENSOR3D_DECLARATION(output))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(output);

    // The output channel index is decomposed as:
    //   out_c = block_offset * in_channels + in_c
    // where in_channels is the number of channels of the input tensor.
    const int in_channels  = CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE);
    const int out_c        = get_global_id(2);
    const int in_c         = out_c % in_channels;
    const int block_offset = out_c / in_channels;

    // Spatial coordinates of the input element feeding this output element
    const int src_x = get_global_id(0) * BLOCK_SHAPE + block_offset % BLOCK_SHAPE;
    const int src_y = get_global_id(1) * BLOCK_SHAPE + block_offset / BLOCK_SHAPE;

    *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&src, src_x, src_y, in_c, batch_id));
}
#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl new file mode 100644 index 0000000000..723c491165 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function applies upsample on an input image. (NCHW) + * + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE = Tensor data type. 
/** This function applies upsample on an input image. (NCHW)
 *
 * Each input element is replicated into a 2x2 block of the output: the
 * vectorized path duplicates every element horizontally and stores the same
 * row twice (rows y and y+1 of the destination).
 *
 * @attention The following variables must be passed at compile time:
 * -# -DDATA_TYPE = Tensor data type. Supported data types: All
 * -# -DVEC_SIZE_IN = Input vector size
 * -# -DVEC_SIZE_OUT = Output vector size
 * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
 * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void upsample_layer_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
    // Check if access on width gets out of bounds
    // If it does shift access vector to access elements within bounds
    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;

    // NOTE(review): this path hard-codes vload8/vstore16, i.e. it appears to
    // assume VEC_SIZE_IN == 8 and VEC_SIZE_OUT == 16 — confirm against the
    // host-side kernel configuration.
    VEC_DATA_TYPE(DATA_TYPE, 8)
    data = vload8(0, (__global DATA_TYPE *)src.ptr);

    // Duplicate every element horizontally: [a b c ...] -> [a a b b c c ...]
    VEC_DATA_TYPE(DATA_TYPE, 16)
    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);

    // Store the duplicated row twice to also upsample vertically
    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
    // Scalar fallback: write the single source element into both output rows
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl new file mode 100644 index 0000000000..85eff9e6d9 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(SRC_DIM_Z) +/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. 
/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_2x2_3x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

    // Load the values from the input tensor
    // (3 weights for the 1D variants, a full 3x3 tile otherwise)
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

    // Transformed filter rows; coefficients (1, 0.5, 0.25) correspond to the
    // F(2x2, 3x3) Winograd filter-transform matrix applied on both sides.
    // Row 0
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out0    = 0.0f;
    out0.s0 = (w0.s0);
    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
    out0.s3 = (w0.s2);

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out1    = 0.0f;
    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;

    // Row 2
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out2    = 0.0f;
    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;

    // Row 3
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out3    = 0.0f;
    out3.s0 = (w2.s0);
    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
    out3.s3 = (w2.s2);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // The Z workgroup dimension enumerates (filter, channel) pairs
    int z  = get_global_id(2);
    int x0 = z / SRC_DIM_Z; // idx filter
    int y0 = z % SRC_DIM_Z; // idx channel

    // Get output address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;

    // Store the values across the channels
    // 16 channels for 3x3 kernels
    // 4 channels for 3x1 or 1x3 kernels
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_4x4_3x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

    // Load the values from the input tensor
    // (3 weights for the 1D variants, a full 3x3 tile otherwise)
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

    // Transformed filter rows; the rational coefficients (/16, /24, /96, ...)
    // correspond to the F(4x4, 3x3) Winograd filter-transform matrix applied
    // on both sides.
    // Row 0
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0    = 0.0f;
    out0.s0 = (w0.s0) / 16.f;
    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
    out0.s5 = (w0.s2) / 4.f;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out1    = 0.0f;
    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;

    // Row 2
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out2    = 0.0f;
    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;

    // Row 3
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out3    = 0.0f;
    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;

    // Row 4
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out4    = 0.0f;
    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;

    // Row 5
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out5    = 0.0f;
    out5.s0 = (w2.s0) / 4.f;
    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
    out5.s5 = (w2.s2);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // The Z workgroup dimension enumerates (filter, channel) pairs
    int z  = get_global_id(2);
    int x0 = z / SRC_DIM_Z; // idx filter
    int y0 = z % SRC_DIM_Z; // idx channel

    // Get output address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;

    // Store the values across the channels
    // 36 channels for 3x3 kernels
    // 6 channels for 3x1 or 1x3 kernels
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_4x4_5x5_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    // Base address of the current 5x5 (or 5x1/1x5) filter tile for this work-item.
    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

    // Load the values from the input tensor.
    // Horizontal variant reads a single 1x5 row; vertical reads a single 5x1 column;
    // otherwise the full 5x5 filter (five rows of 4+1 values) is loaded.
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

    // Transform the input tile.
    // Each outN row below is one row of the 8x8 transformed filter (1x8 for the
    // 1D horizontal/vertical variants); the fixed fractional coefficients are the
    // entries of the Winograd filter-transform matrix for this tile/filter size.

    // Row 0
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0    = 0.0f;
    out0.s0 = w00.s0;
    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
    out0.s7 = w01;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out1    = 0.0f;
    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;

    // Row 2
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out2    = 0.0f;
    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;

    // Row 3
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out3    = 0.0f;
    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;

    // Row 4
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out4    = 0.0f;
    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
    out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
    out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
    out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
    out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;

    // Row 5
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out5    = 0.0f;
    out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
    out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
    out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
    out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
    out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
    out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
    out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
    out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;

    // Row 6
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out6    = 0.0f;
    out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
    out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
    out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
    out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
    out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
    out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
    out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
    out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;

    // Row 7
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out7    = 0.0f;
    out7.s0 = w40.s0;
    out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
    out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
    out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
    out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
    out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
    out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
    out7.s7 = w41;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // The Z work-item index encodes both the filter (OFM) and channel (IFM) indices.
    int z  = get_global_id(2);
    int x0 = z / SRC_DIM_Z; // idx filter
    int y0 = z % SRC_DIM_Z; // idx channel

    // Get output address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;

    // Store the values across the channels: each of the 8 (1D) or 64 (2D)
    // transformed coefficients goes to its own Z-plane of the destination.
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}

#endif // defined(SRC_DIM_Z)

#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g.
-DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_2x1_3x1_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Thin wrapper: with WINOGRAD_FILTER_TRANSFORM_HORIZONTAL defined, the 2D
    // 2x2/3x3 transform kernel processes a single filter row, which is exactly
    // the 2x1/3x1 transform (same pattern as the 4x4_5x5 kernel above).
    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_offset_first_element_in_bytes);
}

/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_4x1_3x1_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Thin wrapper: with WINOGRAD_FILTER_TRANSFORM_HORIZONTAL defined, the 2D
    // 4x4/3x3 transform kernel processes a single filter row (4x1/3x1 case).
    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_offset_first_element_in_bytes);
}

/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor.
Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_4x1_5x1_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Thin wrapper: with WINOGRAD_FILTER_TRANSFORM_HORIZONTAL defined, the 2D
    // 4x4/5x5 transform kernel processes a single filter row (4x1/5x1 case).
    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_offset_first_element_in_bytes);
}

#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_1x2_1x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    // Thin wrapper: with WINOGRAD_FILTER_TRANSFORM_VERTICAL defined, the 2D
    // 2x2/3x3 transform kernel processes a single filter column (1x2/1x3 case).
    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_offset_first_element_in_bytes);
}

/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor.
Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    // Vertical 1x3 case: thin wrapper forwarding to the generic 4x4/3x3 transform
+    // (vertical behaviour selected by -DWINOGRAD_FILTER_TRANSFORM_VERTICAL).
+    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    // Vertical 1x5 case: thin wrapper forwarding to the generic 4x4/5x5 transform
+    // (vertical behaviour selected by -DWINOGRAD_FILTER_TRANSFORM_VERTICAL).
+    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
new file mode 100644
index 0000000000..8c382183c3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
@@ -0,0 +1,1346 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+/* Computes one 8-wide row of the input transform (one row of B^T * d) for the
+ * 4x4 output tile / 5x5 kernel configuration, as named. 'tmp' holds the 8
+ * untransformed values of the row; 'comm_fact' caches common sub-expressions
+ * shared between output lanes. NOTE(review): the coefficients are taken on
+ * trust from the macro name — confirm against the F(4,5) transform matrix
+ * before editing them. */
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact)                            \
+    ({                                                                     \
+        comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6;                   \
+        comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5;                   \
+        comm_fact.s2 = 2.5f * tmp.s3;                                      \
+        comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2;       \
+        comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6;           \
+        comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4;              \
+        comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2;       \
+                                                                           \
+        out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2;        \
+        out.s1 = comm_fact.s0 + comm_fact.s1;                              \
+        out.s2 = comm_fact.s0 - comm_fact.s1;                              \
+        out.s3 = comm_fact.s3 + comm_fact.s4;                              \
+        out.s4 = comm_fact.s4 - comm_fact.s3;                              \
+        out.s5 = comm_fact.s5 + comm_fact.s6;                              \
+        out.s6 = comm_fact.s5 - comm_fact.s6;                              \
+        out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5;        \
+    })
+
+/* Same as above for the 2x2 output tile / 7x7 kernel configuration (one row
+ * of B^T * d over an 8-wide input row). NOTE(review): coefficients assumed
+ * correct for F(2,7) — verify against the transform matrix before editing. */
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact)                                            \
+    ({                                                                                     \
+        comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6;                           \
+        comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5;                    \
+        comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6;                            \
+        comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5;                    \
+        comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6;                             \
+        comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5;                    \
+        out.s0       = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6;        \
+        out.s1       = comm_fact.s0 - comm_fact.s1;                                        \
+        out.s2       = comm_fact.s0 + comm_fact.s1;                                        \
+        out.s3       = comm_fact.s2 - comm_fact.s3;                                        \
+        out.s4       = comm_fact.s2 + comm_fact.s3;                                        \
+        out.s5       = comm_fact.s4 - comm_fact.s5;                                        \
+        out.s6       = comm_fact.s4 + comm_fact.s5;                                        \
+        out.s7       = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+    })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or
1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // One work-item per (tile x, tile y, channel [, batch]).
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+    // Load the 4-tap row (horizontal), 4-tap column (vertical) or 4x4 tile (2D case).
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Input transform: tmp* hold the row-wise combinations, out** the final coefficients.
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp0 = in_row0;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    tmp0 -= in_row2;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+    DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+    DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+    DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp1 = in_row1 + in_row2;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp2 = in_row2 - in_row1;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp3 = in_row1 - in_row3;
+
+    DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+    DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+    DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+    DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
+
+    DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+    DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+    DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+    DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
+
+    DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+    DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+    DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+    DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Output layout: X indexes the channel, Y the tile, and each transform coefficient goes to its own Z plane.
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z))  = out10;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z))  = out11;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z))  = out12;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z))  = out13;
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out20;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out21;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Each work-item transforms the same tile for TWO consecutive channels (z and z + 1).
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+    const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+    // First channel: load the 4-tap row/column or 4x4 tile.
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Second channel: advance one Z plane and load again (in_row4..in_row7).
+    src_addr += src_stride_z;
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Transform both channels; out** components .s0/.s1 hold channel z / z + 1 respectively.
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp0 = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp4 = in_row4;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    tmp0 -= in_row2;
+    tmp4 -= in_row6;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp1 = in_row1 + in_row2;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp2 = in_row2 - in_row1;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp3 = in_row1 - in_row3;
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp5 = in_row5 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp6 = in_row6 - in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp7 = in_row5 - in_row7;
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Output layout: X indexes the channel pair, Y the tile, one Z plane per transform coefficient.
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+    vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+    vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+    vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+    vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+    vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+    vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+    vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+    vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+    vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+    vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+    vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+    vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+    vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+    vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+    vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image.
Supported data types: F32/F16
 * @param[in] src_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[in] src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                          src_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_ptr                             Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                          dst_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[in] src_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] dst_stride_w                        Stride of the destination tensor in W dimension (in bytes)
 */
__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // One work-item transforms one input tile. In the full 2D case rows 0..5 are
    // read and 36 transformed values are written, one per dst Z-plane; in the
    // 1D (horizontal/vertical) cases only the first 6 channels are written.
    const int x = get_global_id(0);
    const int y = get_global_id(1);
#if defined(SRC_DEPTH)
    // Global Z enumerates (batch, channel) pairs when SRC_DEPTH is known.
    const int z = get_global_id(2) % SRC_DEPTH;
    const int b = get_global_id(2) / SRC_DEPTH;
#else /* defined(SRC_DEPTH) */
    const int z = get_global_id(2);
#endif /* defined(SRC_DEPTH) */

    // Compute input address
#if defined(SRC_DEPTH)
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
#else /* defined(SRC_DEPTH) */
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
#endif /* defined(SRC_DEPTH) */

    // Shift the tile origin back by the pad amounts so padded reads land before the tile start
    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // Row0: for the 1xN (vertical) variant the 6 taps are gathered along Y
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
                                        *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // Row0: 6 contiguous taps along X (elements 0-3 in d00, elements 4-5 in d01)
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    DATA_TYPE out0 = 0.0f;
    DATA_TYPE out1 = 0.0f;
    DATA_TYPE out2 = 0.0f;
    DATA_TYPE out3 = 0.0f;
    DATA_TYPE out4 = 0.0f;
    DATA_TYPE out5 = 0.0f;

    // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
    out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
    out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
    out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
    out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
    out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
    out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // Row4
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));

    // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
    DATA_TYPE k0 = d41.s0;
    DATA_TYPE k1 = d41.s0;
    DATA_TYPE k2 = d41.s0;
    DATA_TYPE k3 = d41.s0;
    DATA_TYPE k4 = d41.s0;
    DATA_TYPE k5 = 0.0f;

    k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
    k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
    k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
    k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
    k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
    k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;

    out0 += k0;
    out1 += k1;
    out2 += k2;
    out3 += k3;
    out4 += k4;
    out5 += k5;

    // Row2
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));

    out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
    out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
    out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
    out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
    out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
    out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // Compute destination address: tiles are linearized along Y, channels along Z
#if defined(SRC_DEPTH)
    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
#else /* defined(SRC_DEPTH) */
    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
#endif /* defined(SRC_DEPTH) */

    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);

    *(dst_addr) = out0;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out1;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out2;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out3;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out4;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out5;
    dst_addr += dst_plane_stride;

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    // Channels [6, 29] all share the row4-derived k* terms as their starting value
    DATA_TYPE out6 = k0;
    DATA_TYPE out7 = k1;
    DATA_TYPE out8 = k2;
    DATA_TYPE out9 = k3;
    DATA_TYPE out10 = k4;
    DATA_TYPE out11 = k5;
    DATA_TYPE out12 = k0;
    DATA_TYPE out13 = k1;
    DATA_TYPE out14 = k2;
    DATA_TYPE out15 = k3;
    DATA_TYPE out16 = k4;
    DATA_TYPE out17 = k5;
    DATA_TYPE out18 = k0;
    DATA_TYPE out19 = k1;
    DATA_TYPE out20 = k2;
    DATA_TYPE out21 = k3;
    DATA_TYPE out22 = k4;
    DATA_TYPE out23 = k5;
    DATA_TYPE out24 = k0;
    DATA_TYPE out25 = k1;
    DATA_TYPE out26 = k2;
    DATA_TYPE out27 = k3;
    DATA_TYPE out28 = k4;
    DATA_TYPE out29 = k5;

    // Row1
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));

    // Row3
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));

    // Compute common parts for the channels between [6, 29]
    // Channels [6, 11]:  [out10, out11, out12, out13, out14, out15]
    // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
    DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
    DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
    DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
    DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
    DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
    DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
    DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
    DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
    DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
    DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
    DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
    DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;

    // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
    // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
    DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
    DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
    DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
    DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
    DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
    DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
    DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
    DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
    DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
    DATA_TYPE part21 = part9 * 0.25f;  // 2.0f * (d20.s1 - d20.s3)
    DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
    DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;

    out6 += part0 - part1;
    out12 += part0 + part1;
    out7 += part2 + part3 + part4 + part5;
    out8 += part2 - part3 + part4 - part5;
    out13 += part2 + part3 - part4 - part5;
    out14 += part2 - part3 - part4 + part5;
    out9 += part6 + part7 + part8 + part9;
    out10 += part6 - part7 + part8 - part9;
    out15 += part6 - part7 - part8 + part9;
    out16 += part6 + part7 - part8 - part9;
    out11 += part10 + part11;
    out17 += part10 - part11;

    out18 += part13 - part12;
    out24 += part13 + part12;
    out19 += part14 + part15 + part16 + part17;
    out20 += part14 - part15 + part16 - part17;
    out25 += part14 - part15 - part16 + part17;
    out26 += part14 + part15 - part16 - part17;
    out21 += part18 + part19 + part20 + part21;
    out22 += part18 - part19 + part20 - part21;
    out27 += part18 - part19 - part20 + part21;
    out28 += part18 + part19 - part20 - part21;
    out23 += part22 + part23;
    out29 += part22 - part23;

    *(dst_addr) = out6;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out7;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out8;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out9;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out10;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out11;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out12;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out13;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out14;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out15;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out16;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out17;
    dst_addr += dst_plane_stride;

    *(dst_addr) = out18;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out19;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out20;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out21;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out22;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out23;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out24;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out25;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out26;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out27;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out28;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out29;
    dst_addr += dst_plane_stride;

    // Row5
    VEC_DATA_TYPE(DATA_TYPE, 4)
    d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));

    // Channels [30, 35] (out0..out5 are reused as scratch for the last 6 channels)
    out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
    out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
    out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
    out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
    out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
    out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;

    *(dst_addr) = out0;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out1;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out2;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out3;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out4;
    dst_addr += dst_plane_stride;
    *(dst_addr) = out5;
    dst_addr += dst_plane_stride;
#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
 *
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in] src_ptr                             Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[in] src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                          src_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_ptr                             Pointer to the destination tensor.
Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                          dst_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[in] src_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] dst_stride_w                        Stride of the destination tensor in W dimension (in bytes)
 */
__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // One work-item transforms one 8-wide/8-tall input tile; the full 2D case
    // reads rows 0..7 and writes 64 values across dst Z-planes, the 1D cases
    // read a single 8-element row/column and write 8 values.
    const int x = get_global_id(0);
    const int y = get_global_id(1);
#if defined(SRC_DEPTH)
    // Global Z enumerates (batch, channel) pairs when SRC_DEPTH is known.
    const int z = get_global_id(2) % SRC_DEPTH;
    const int b = get_global_id(2) / SRC_DEPTH;
#else /* defined(SRC_DEPTH) */
    const int z = get_global_id(2);
#endif /* defined(SRC_DEPTH) */

    // Compute input address
#if defined(SRC_DEPTH)
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
#else /* defined(SRC_DEPTH) */
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
#endif /* defined(SRC_DEPTH) */
    // Shift the tile origin back by the pad amounts so padded reads land before the tile start
    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);

    // Load input tile
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
                                                                              *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // Calculate common factors for intermediate tensor
    VEC_DATA_TYPE(DATA_TYPE, 8)
    tmp0 = in_row0;
    VEC_DATA_TYPE(DATA_TYPE, 8)
    comm_fact0 = 0.0f;

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
    tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;

    VEC_DATA_TYPE(DATA_TYPE, 8)
    comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
    VEC_DATA_TYPE(DATA_TYPE, 8)
    comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;

    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;

    comm_fact0 = (DATA_TYPE)2.5f * in_row3;
    comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;

    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;

    comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
    comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;

    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // Calculate output rows (reuse comm_fact0 vector)
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0;

    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out1, out2, out3, out4, out5, out6, out7;

    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // Store values across the channels
#if defined(SRC_DEPTH)
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
#else /* defined(SRC_DEPTH) */
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
#endif /* defined(SRC_DEPTH) */

    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;

#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
 *
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
Supported data types: float/half.
 *
 * @param[in] src_ptr                             Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[in] src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                          src_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_ptr                             Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                          dst_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[in] src_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] dst_stride_w                        Stride of the destination tensor in W dimension (in bytes)
 */
__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // Thin forwarding wrapper: the 2x1/3x1 horizontal case is handled by the
    // generic 2x2/3x3 kernel, specialized via -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL.
    winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
                                                 src_stride_x,
                                                 src_step_x,
                                                 src_stride_y,
                                                 src_step_y,
                                                 src_stride_z,
                                                 src_step_z,
                                                 src_offset_first_element_in_bytes,
                                                 dst_ptr,
                                                 dst_stride_x,
                                                 dst_step_x,
                                                 dst_stride_y,
                                                 dst_step_y,
                                                 dst_stride_z,
                                                 dst_step_z,
                                                 dst_offset_first_element_in_bytes,
                                                 src_stride_w,
                                                 dst_stride_w);
}

/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
 *
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in] src_ptr                             Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[in] src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                          src_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_ptr                             Pointer to the destination tensor.
Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                          dst_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[in] src_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] dst_stride_w                        Stride of the destination tensor in W dimension (in bytes)
 */
__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // Thin forwarding wrapper: the 2x1/3x1 step-2 horizontal case is handled by
    // the generic 2x2/3x3 step-2 kernel, specialized via -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL.
    winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
                                                 src_stride_x,
                                                 src_step_x,
                                                 src_stride_y,
                                                 src_step_y,
                                                 src_stride_z,
                                                 src_step_z,
                                                 src_offset_first_element_in_bytes,
                                                 dst_ptr,
                                                 dst_stride_x,
                                                 dst_step_x,
                                                 dst_stride_y,
                                                 dst_step_y,
                                                 dst_stride_z,
                                                 dst_step_z,
                                                 dst_offset_first_element_in_bytes,
                                                 src_stride_w,
                                                 dst_stride_w);
}

/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
 *
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g.
-DOUTPUT_TILE_H=1
 * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in] src_ptr                             Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                        Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                        Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes   The offset of the first element in the source image
 * @param[in] src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                          src_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_ptr                             Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                          dst_stride_z * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[in] src_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] dst_stride_w                        Stride of the destination tensor in W dimension (in bytes)
 */
__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // Thin forwarding wrapper: the 4x1/3x1 horizontal case is handled by the
    // generic 4x4/3x3 kernel, specialized via -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL.
    winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
                                                 src_stride_x,
                                                 src_step_x,
                                                 src_stride_y,
                                                 src_step_y,
                                                 src_stride_z,
                                                 src_step_z,
                                                 src_offset_first_element_in_bytes,
                                                 dst_ptr,
                                                 dst_stride_x,
                                                 dst_step_x,
                                                 dst_stride_y,
                                                 dst_step_y,
                                                 dst_stride_z,
                                                 dst_step_z,
                                                 dst_offset_first_element_in_bytes,
                                                 src_stride_w,
                                                 dst_stride_w);
}

/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
 *
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g.
-DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. 
-DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. 
-DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} +#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl new file mode 100644 index 0000000000..861ed50651 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x2_3x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size +#if defined(SRC_DEPTH) + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH); + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); +#else /* defined(SRC_DEPTH) */ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); +#endif /* defined(SRC_DEPTH) */ + + // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile + DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // 
Compute the 2x1 or 1x2 output tile + // out00 = d00 + d01 + d02 + // out01 = d01 - d02 - d03 + + float out00 = d00 + d01 + d02; + float out01 = d01 - d02 - d03; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); + DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z)); + + DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z)); + DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z)); + DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z)); + DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z)); + + DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z)); + DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z)); + DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z)); + DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z)); + + // Compute the 2x2 output tile + float k0 = d01 + d11 + d21; + float k1 = d02 + d12 + d22; + float k2 = d11 - d21 - d31; + float k3 = d12 - d22 - d32; + + // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22 + // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23) + // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32) + // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33) + + float out00 = d10; + float out01 = -d13; + float out10 = d10; + float out11 = -d13; + + out00 += d00 + d20 + k0 + k1; + out01 += k0 - k1 - (d03 + d23); + out10 += -d20 - d30 + k2 + k3; + out11 += k2 - k3 + d23 + d33; +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * 
OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; + int z_out = get_global_id(0); +#if defined(SRC_DEPTH) + int batch = get_global_id(2) / SRC_DEPTH; +#endif /* defined(SRC_DEPTH) */ + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out00 += (float)b; + out01 += (float)b; +#endif // defined(HAS_BIAS) + + // Get output address +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; +#endif /* defined(SRC_DEPTH) */ + + // Store the output tile +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + const VEC_DATA_TYPE(DATA_TYPE, 2) + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, + (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(HAS_BIAS) + // Add bias + out10 += (DATA_TYPE)b; + out11 += (DATA_TYPE)b; +#endif // defined(HAS_BIAS) + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, + (__global 
DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x4_3x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // 
defined(HAS_BIAS) +) +{ + // Each thread stores a 4x4/4x1 or 1x4 tile +#if defined(SRC_DEPTH) + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH); + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); +#else /* defined(SRC_DEPTH) */ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); +#endif /* defined(SRC_DEPTH) */ + + // Load the values across the channels to compose the 6x6 or 6x1 tile + DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Compute out00, out01, out02 and out03 + float out00 = d00 + d01 + d02 + d03 + d04; + float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04; + float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04; + float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); + DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z)); + DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z)); + DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z)); + DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z)); + DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z)); + + DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z)); + DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z)); + DATA_TYPE d22 = 
*((__global DATA_TYPE *)(src_addr + 14 * src_stride_z)); + DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z)); + DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z)); + DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z)); + + DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z)); + DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z)); + DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z)); + DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z)); + DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z)); + DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z)); + + DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z)); + DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z)); + DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z)); + DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z)); + DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z)); + DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z)); + + DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z)); + DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z)); + DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z)); + DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z)); + DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z)); + DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z)); + + // Compute out00, out01, out02 and out03 + float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31; + float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31; + float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31; + float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31; + + 
float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44; + float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44; + + out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42; + out01 += k1 - d02 - d12 - d22 - d32 - d42; + out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42; + out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45; + + // Compute out10, out11, out12 and out13 + float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + + k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44; + k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44; + + out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42; + out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42; + out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42; + out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45; + + // Compute out20, out21, out22 and out23 + float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + + k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44; + k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44; + + out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42; + out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42; + out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42; + out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45; + + // Compute out30, 
out31, out32 and out33 + float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + + k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54; + k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54; + + out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52; + out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52; + out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52; + out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55; +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; + int z_out = get_global_id(0); +#if defined(SRC_DEPTH) + int batch = get_global_id(2) / SRC_DEPTH; +#endif /* defined(SRC_DEPTH) */ + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out00 += (float)b; + out01 += (float)b; + out02 += (float)b; + out03 += (float)b; +#endif // defined(HAS_BIAS) + + // Get output address +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; +#endif /* defined(SRC_DEPTH) */ + + // Store the output tile + const VEC_DATA_TYPE(DATA_TYPE, 4) + 
out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(HAS_BIAS) + // Add bias + out10 += (float)b; + out11 += (float)b; + out12 += (float)b; + out13 += (float)b; + + out20 += (float)b; + out21 += (float)b; + out22 += (float)b; + out23 += (float)b; + + out30 += (float)b; + out31 += (float)b; + out32 += (float)b; + out33 += (float)b; +#endif // defined(HAS_BIAS) + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, + (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, + (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, + (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} + +#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \ + ({ \ + comm_fact.s0 = d1 + d2; \ + 
comm_fact.s1 = d3 + d4; \ + comm_fact.s2 = d5 + d6; \ + \ + col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \ + col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \ + \ + comm_fact.s0 = d1 - d2; \ + comm_fact.s1 = d3 - d4; \ + comm_fact.s2 = d5 - d6; \ + \ + col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \ + col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \ + }) + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x4_5x5_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // 
defined(HAS_BIAS) +) +{ + // Each thread stores a 4x4/4x1 or 1x4 tile +#if defined(SRC_DEPTH) + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH); + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); +#else /* defined(SRC_DEPTH) */ + + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); +#endif /* defined(SRC_DEPTH) */ + + // Compute output address + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; + int z_out = get_global_id(0); +#if defined(SRC_DEPTH) + int batch = get_global_id(2) / SRC_DEPTH; +#endif /* defined(SRC_DEPTH) */ + +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; +#endif /* defined(SRC_DEPTH) */ + + // Load the values across the channels to compose the input tile + DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); + DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Compute out00, out01, out02 and out03 + float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06; + 
float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06; + float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06; + float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07; + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out00 += (DATA_TYPE)b; + out01 += (DATA_TYPE)b; + out02 += (DATA_TYPE)b; + out03 += (DATA_TYPE)b; +#endif // defined(HAS_BIAS) + + // Store the output tile +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, + B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)); + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z)); + DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z)); + DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z)); + DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z)); + DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z)); + DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z)); + DATA_TYPE d16 = *((__global DATA_TYPE 
*)(src_addr + 14 * src_stride_z)); + DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z)); + + DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z)); + DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z)); + DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z)); + DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z)); + DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z)); + DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z)); + DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z)); + DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z)); + + DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z)); + DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z)); + DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z)); + DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z)); + DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z)); + DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z)); + DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z)); + DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z)); + + DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z)); + DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z)); + DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z)); + DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z)); + DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z)); + DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z)); + DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z)); + DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z)); + + DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z)); + 
DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z)); + DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z)); + DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z)); + DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z)); + DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z)); + DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z)); + DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z)); + + DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z)); + DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z)); + DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z)); + DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z)); + DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z)); + DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z)); + DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z)); + DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z)); + + DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z)); + DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z)); + DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z)); + DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z)); + DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z)); + DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z)); + DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z)); + DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z)); + + // Compute the 8x4 intermediate tensor + VEC_DATA_TYPE(float, 4) + comm_fact0, comm_fact1, comm_fact2; + VEC_DATA_TYPE(float, 4) + tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7; + + COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, 
d50, d60, d70, comm_fact0); + COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0); + COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0); + COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0); + COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0); + COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0); + COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0); + COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0); + + // Compute the 4x4 output tile + comm_fact0 = tmp_col1 + tmp_col2; + comm_fact1 = tmp_col3 + tmp_col4; + comm_fact2 = tmp_col5 + tmp_col6; + + VEC_DATA_TYPE(float, 4) + out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0; + VEC_DATA_TYPE(float, 4) + out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2; + + comm_fact0 = tmp_col1 - tmp_col2; + comm_fact1 = tmp_col3 - tmp_col4; + comm_fact2 = tmp_col5 - tmp_col6; + + VEC_DATA_TYPE(float, 4) + out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2; + VEC_DATA_TYPE(float, 4) + out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7; + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out_col0 += (VEC_DATA_TYPE(float, 4))b; + out_col1 += (VEC_DATA_TYPE(float, 4))b; + out_col2 += (VEC_DATA_TYPE(float, 4))b; + out_col3 += (VEC_DATA_TYPE(float, 4))b; +#endif // defined(HAS_BIAS) + + // Store the output tile + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 
4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // 
defined(HAS_BIAS) +) +{ + winograd_output_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // 
defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_5x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // 
defined(HAS_BIAS)
)
{
    winograd_output_transform_4x4_5x5_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_stride_w,
                                           dst_step_w,
                                           dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
                                           ,
                                           bias_ptr,
                                           bias_stride_x,
                                           bias_step_x,
                                           bias_offset_first_element_in_bytes
#endif // defined(HAS_BIAS)
                                          );
}

#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if defined(VEC_SIZE) && VEC_SIZE == 2
/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
 *
 * The vertical 1x2/1x3 transform is the transpose of the 2x1/3x1 case, so this kernel
 * simply forwards all of its arguments to winograd_output_transform_2x2_3x3_nchw, which
 * handles both orientations based on the -DWINOGRAD_OUTPUT_TRANSFORM_* compile-time flag.
 *
 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_output_transform_1x2_1x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bias)
#endif // defined(HAS_BIAS)
)
{
    winograd_output_transform_2x2_3x3_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_stride_w,
                                           dst_step_w,
                                           dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
                                           ,
                                           bias_ptr,
                                           bias_stride_x,
                                           bias_step_x,
                                           bias_offset_first_element_in_bytes
#endif // defined(HAS_BIAS)
                                          );
}

#endif // defined(VEC_SIZE) && VEC_SIZE == 2

#if defined(VEC_SIZE) && VEC_SIZE == 4
/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
 *
 * The vertical 1x4/1x3 transform is the transpose of the 4x1/3x1 case, so this kernel
 * simply forwards all of its arguments to winograd_output_transform_4x4_3x3_nchw, which
 * handles both orientations based on the -DWINOGRAD_OUTPUT_TRANSFORM_* compile-time flag.
 *
 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_output_transform_1x4_1x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bias)
#endif // defined(HAS_BIAS)
)
{
    winograd_output_transform_4x4_3x3_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_stride_w,
                                           dst_step_w,
                                           dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
                                           ,
                                           bias_ptr,
                                           bias_stride_x,
                                           bias_step_x,
                                           bias_offset_first_element_in_bytes
#endif // defined(HAS_BIAS)
                                          );
}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
 *
 * The vertical 1x4/1x5 transform is the transpose of the 4x1/5x1 case, so this kernel
 * simply forwards all of its arguments to winograd_output_transform_4x4_5x5_nchw, which
 * handles both orientations based on the -DWINOGRAD_OUTPUT_TRANSFORM_* compile-time flag.
 *
 * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_output_transform_1x4_1x5_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(bias)
#endif // defined(HAS_BIAS)
)
{
    winograd_output_transform_4x4_5x5_nchw(src_ptr,
                                           src_stride_x,
                                           src_step_x,
                                           src_stride_y,
                                           src_step_y,
                                           src_stride_z,
                                           src_step_z,
                                           src_stride_w,
                                           src_step_w,
                                           src_offset_first_element_in_bytes,
                                           dst_ptr,
                                           dst_stride_x,
                                           dst_step_x,
                                           dst_stride_y,
                                           dst_step_y,
                                           dst_stride_z,
                                           dst_step_z,
                                           dst_stride_w,
                                           dst_step_w,
                                           dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
                                           ,
                                           bias_ptr,
                                           bias_stride_x,
                                           bias_step_x,
                                           bias_offset_first_element_in_bytes
#endif // defined(HAS_BIAS)
                                          );
}

#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)