aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/nchw
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/nchw')
-rw-r--r--src/core/CL/cl_kernels/nchw/batch_to_space.cl131
-rw-r--r--src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl147
-rw-r--r--src/core/CL/cl_kernels/nchw/channel_shuffle.cl103
-rw-r--r--src/core/CL/cl_kernels/nchw/depth_to_space.cl69
-rw-r--r--src/core/CL/cl_kernels/nchw/dequantization_layer.cl86
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl316
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl291
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl313
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl308
-rw-r--r--src/core/CL/cl_kernels/nchw/im2col.cl863
-rw-r--r--src/core/CL/cl_kernels/nchw/normalization_layer.cl175
-rw-r--r--src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl82
-rw-r--r--src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl101
-rw-r--r--src/core/CL/cl_kernels/nchw/pooling_layer.cl331
-rw-r--r--src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl142
-rw-r--r--src/core/CL/cl_kernels/nchw/prior_box_layer.cl139
-rw-r--r--src/core/CL/cl_kernels/nchw/remap.cl133
-rw-r--r--src/core/CL/cl_kernels/nchw/reorg_layer.cl75
-rw-r--r--src/core/CL/cl_kernels/nchw/scale.cl148
-rw-r--r--src/core/CL/cl_kernels/nchw/scale_quantized.cl86
-rw-r--r--src/core/CL/cl_kernels/nchw/space_to_batch.cl156
-rw-r--r--src/core/CL/cl_kernels/nchw/space_to_depth.cl69
-rw-r--r--src/core/CL/cl_kernels/nchw/upsample_layer.cl79
-rw-r--r--src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl911
-rw-r--r--src/core/CL/cl_kernels/nchw/winograd_input_transform.cl1346
-rw-r--r--src/core/CL/cl_kernels/nchw/winograd_output_transform.cl1082
26 files changed, 7682 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/nchw/batch_to_space.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
new file mode 100644
index 0000000000..89129cff3f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation for NCHW tensors, with the block shape read from a tensor at run time.
+ *
+ * Each work-item copies one DATA_TYPE element from the input position resolved by CONVERT_TO_TENSOR3D_STRUCT
+ * to the output coordinate
+ *   (x * block_x + (batch_id / r) % block_x, y * block_y + (batch_id / r) / block_x, z, batch_id % r)
+ * where (x, y, z) are the global ids and r = BATCH_SIZE / (block_x * block_y).
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note NOTE(review): block_shape is declared with VECTOR_DECLARATION, and the vector arguments documented elsewhere in this
+ *       patch only list an X stride/step; confirm whether block_shape_stride_y / block_shape_step_y really exist.
+ * @note NOTE(review): output is declared with TENSOR4D_DECLARATION; the W stride/step arguments are presumably also part of
+ *       the signature but are not listed below - confirm against the helper macros.
+ *
+ * @param[in]  input_ptr                                  Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                             Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                               input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  input_stride_y                             Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                               input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  input_stride_z                             Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                               input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes        The offset of the first element in the source tensor
+ * @param[in]  batch_id                                   The input tensor batch id
+ * @param[in]  block_shape_ptr                            Pointer to the block shape tensor (element 0: block X, element 1: block Y). Supported data types: S32
+ * @param[in]  block_shape_stride_x                       Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                         block_shape_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  block_shape_stride_y                       Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                         block_shape_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes  The offset of the first element in the block shape tensor
+ * @param[out] output_ptr                                 Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                            Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                              output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_stride_y                            Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                              output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  output_stride_z                            Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                              output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes       The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
+{
+    // Work-item coordinates inside the current input batch
+    const int col   = get_global_id(0);
+    const int row   = get_global_id(1);
+    const int plane = get_global_id(2);
+
+    Vector   shape = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor4D dst   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Tensor3D src   = CONVERT_TO_TENSOR3D_STRUCT(input);
+
+    // Block dimensions are stored as two consecutive S32 values
+    const int bs_x = *((__global int *)vector_offset(&shape, 0));
+    const int bs_y = *((__global int *)vector_offset(&shape, 1));
+
+    // batch_id is split into an output W index (remainder) and a position inside the block (quotient)
+    const int group     = (BATCH_SIZE / (bs_x * bs_y));
+    const int dst_w     = batch_id % group;
+    const int block_pos = batch_id / group;
+
+    const int dst_x = col * bs_x + block_pos % bs_x;
+    const int dst_y = row * bs_y + block_pos / bs_x;
+
+    __global DATA_TYPE *dst_elem = (__global DATA_TYPE *)tensor4D_offset(&dst, dst_x, dst_y, plane, dst_w);
+    *dst_elem                    = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation for NCHW tensors, with the block shape fixed at compile time.
+ *
+ * Each work-item copies one DATA_TYPE element from the input position resolved by CONVERT_TO_TENSOR3D_STRUCT
+ * to the output coordinate
+ *   (x * BLOCK_SHAPE_X + (batch_id / r) % BLOCK_SHAPE_X, y * BLOCK_SHAPE_Y + (batch_id / r) / BLOCK_SHAPE_X, z, batch_id % r)
+ * where (x, y, z) are the global ids and r = BATCH_SIZE / (BLOCK_SHAPE_X * BLOCK_SHAPE_Y).
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note NOTE(review): output is declared with TENSOR4D_DECLARATION; the W stride/step arguments are presumably also part of
+ *       the signature but are not listed below - confirm against the helper macros.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    // Work-item coordinates inside the current input batch
+    const int col   = get_global_id(0);
+    const int row   = get_global_id(1);
+    const int plane = get_global_id(2);
+
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
+
+    // Block dimensions come from the build options
+    const int bs_x = BLOCK_SHAPE_X;
+    const int bs_y = BLOCK_SHAPE_Y;
+
+    // batch_id is split into an output W index (remainder) and a position inside the block (quotient)
+    const int group     = (BATCH_SIZE / (bs_x * bs_y));
+    const int dst_w     = batch_id % group;
+    const int block_pos = batch_id / group;
+
+    const int dst_x = col * bs_x + block_pos % bs_x;
+    const int dst_y = row * bs_y + block_pos / bs_x;
+
+    __global DATA_TYPE *dst_elem = (__global DATA_TYPE *)tensor4D_offset(&dst, dst_x, dst_y, plane, dst_w);
+    *dst_elem                    = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
new file mode 100644
index 0000000000..2d466661b3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b)) /* Sum of the two operands */
+#define SUB_OP(a, b) ((a) - (b)) /* First operand minus the second */
+#define MUL_OP(a, b) ((a) * (b)) /* Product of the two operands */
+#define INVSQRT_OP(a) rsqrt((a)) /* Inverse square root through the OpenCL rsqrt() built-in */
+#define SQCVT_SAT(a) (a) /* Pass-through: the operand is yielded unchanged, no conversion is applied here */
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization followed by an activation function. (NCHW)
+ *
+ * For every vector of VEC_SIZE elements the kernel computes
+ *   res = ACTIVATION(gamma * (x - mean) * rsqrt(var + epsilon) + beta)
+ * where mean, var, gamma and beta are the values selected by get_global_id(2).
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note If -DIN_PLACE is passed, the output_* arguments are not part of the signature and the result is stored through the input tensor
+ * @note If -DUSE_DEFAULT_BETA is passed, the beta_* arguments are not part of the signature and no offset is added
+ * @note If -DUSE_DEFAULT_GAMMA is passed, the gamma_* arguments are not part of the signature and no scaling is applied
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor (only when IN_PLACE is not defined). Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor (only when USE_DEFAULT_BETA is not defined). Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor (only when USE_DEFAULT_GAMMA is not defined). Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in; // in-place: results are stored back through the input tensor
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = 0;
+
+ const int current_slice = get_global_id(2); // index used to pick the mean/var/beta/gamma entry
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); // VEC_SIZE input elements
+ denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x)); // variance entry for this slice
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); // rsqrt(var + epsilon)
+
+ // Normalize: x_bar = (data - mean) * rsqrt(var + epsilon)
+ numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x)); // mean entry for this slice
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x)); // gamma entry for this slice
+
+ res = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x)); // beta entry for this slice
+ // beta is not zero, hence we need to perform the addition
+ res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL); // activation selected at build time
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)out.ptr); // store VEC_SIZE results
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
new file mode 100644
index 0000000000..57d82e1e6f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res) /* Stores the quotient of x by y in div_res and the remainder in mod_res */ \
+ ({ /* Statement expression: keeps the temporary r scoped to the macro */ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); /* Quotient from a float reciprocal multiply, truncated to uint. NOTE(review): presumably exact only while operands fit float precision - confirm */ \
+ uint r = div_res * (y); /* Part of x covered by the quotient */ \
+ mod_res = (x)-r; /* Remainder; x is evaluated a second time here */ \
+ })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * Each work-item copies a block of VEC_SIZE elements along X by 2 rows along Y from input channel
+ * curr_channel to output channel channel_id * NUM_GROUPS + group_id, where group_id and channel_id are the
+ * quotient and remainder of curr_channel by K. The batch index is kept unchanged.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ * @note The third global id combines batch and channel: it is split into batch_id (quotient by SRC_DIM_Z) and the channel (remainder).
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ uint curr_channel = 0; // channel id of input
+ uint batch_id = 0; // batch id
+ uint group_id = 0; // group id
+ uint channel_id = 0; // channel id within the group
+
+ // Split the Z global id: batch_id is the quotient by SRC_DIM_Z, curr_channel the remainder
+ DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+ // Split the input channel: group_id is the quotient by K, channel_id the remainder
+ DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+ const uint x = get_global_id(0) * VEC_SIZE; // first element along X handled by this work-item
+ const uint y = get_global_id(1) * 2; // first of the two rows handled by this work-item
+ const uint z = channel_id * NUM_GROUPS + group_id; // destination channel after the shuffle
+
+ // Load the Nx2 block (N = VEC_SIZE) from rows y and y + 1 of the input channel
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+ // Store the two rows at the same x, y and batch position of the destination channel z
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ VSTORE(VEC_SIZE)
+ (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+ VSTORE(VEC_SIZE)
+ (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
new file mode 100644
index 0000000000..b9f223fe9d
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation for NCHW tensors.
+ *
+ * Each work-item copies one DATA_TYPE element from the input position resolved by CONVERT_TO_TENSOR3D_STRUCT
+ * to the output coordinate
+ *   (x * BLOCK_SHAPE + (c / r) % BLOCK_SHAPE, y * BLOCK_SHAPE + (c / r) / BLOCK_SHAPE, c % r, batch_id)
+ * where x, y and c are the three global ids and r = CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE).
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note NOTE(review): output is declared with TENSOR4D_DECLARATION; the W stride/step arguments are presumably also part of
+ *       the signature but are not listed below - confirm against the helper macros.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    // Work-item coordinates; the channel id is kept as size_t so the arithmetic below matches the raw global id
+    const int    col     = get_global_id(0);
+    const int    row     = get_global_id(1);
+    const size_t channel = get_global_id(2);
+
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
+
+    // The input channel is split into an output channel (remainder) and a position inside the block (quotient)
+    const int    group     = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int    dst_z     = channel % group;
+    const size_t block_pos = channel / group;
+
+    const int dst_x = col * BLOCK_SHAPE + block_pos % BLOCK_SHAPE;
+    const int dst_y = row * BLOCK_SHAPE + block_pos / BLOCK_SHAPE;
+
+    __global DATA_TYPE *dst_elem = (__global DATA_TYPE *)tensor4D_offset(&dst, dst_x, dst_y, dst_z, batch_id);
+    *dst_elem                    = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
new file mode 100644
index 0000000000..e0203f7408
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * Every source value is converted to int, then to float, multiplied by scale[get_global_id(2)] and
+ * converted to DATA_TYPE_DST.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note If -DLAST_ACCESSED_X is passed, each work-item processes VEC_SIZE elements and an access starting beyond
+ * LAST_ACCESSED_X is shifted back by the excess number of elements; otherwise each work-item processes a single element.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales, indexed by get_global_id(2)
+ */
+__kernel void dequantization_layer_per_channel_nchw(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Build the tensor structs; the locals reuse the names of the kernel argument prefixes
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check whether the vector access along the width would go out of bounds.
+ // If it would, move both pointers back so that the accessed elements stay within bounds.
+ const int xi = (int)(get_global_id(0) * VEC_SIZE); // first X element of this work-item
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load VEC_SIZE source values and widen them to int
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create the scale vector from the scale entry selected by the Z global id
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = scale[get_global_id(2)];
+
+ // Dequantize: scale * value, computed in float
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Convert to the destination type and store the result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]); // scalar path: one element per work-item
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
new file mode 100644
index 0000000000..8ab2d1d4ea
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// Drop any earlier definition of CONVERT_SAT (presumably the one coming from helpers.h) so it can be redefined below
+#undef CONVERT_SAT
+
+// Arithmetic helpers used by the kernel: plain addition and multiplication.
+// CONVERT_SAT is a pass-through here: it returns its first argument and ignores the target type.
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+// INPUT_PIXEL(data_size) names the extract_input_* function matching STRIDE_X.
+// Only the stride 3 variant depends on the data size; the extra INPUT_PIXEL_STR level lets the argument
+// (e.g. DATA_SIZE) be macro-expanded before it is pasted onto the function name.
+#if STRIDE_X == 3
+#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
+#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
+#elif STRIDE_X == 2
+#define INPUT_PIXEL(data_size) extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_PIXEL(data_size) extract_input_stride1
+#else /* STRIDE_X not equals 1, 2 or 3 */
+#error "Only support strides 1, 2 and 3"
+#endif /* STRIDE_X == 3 */
+
+/** Loads 8 contiguous elements of an input row (convolution stride 1).
+ *
+ * @param[in] input_pixel Pointer to the first element to read.
+ *
+ * @return the 8 elements starting at @p input_pixel.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 8) row = vload8(0, input_pixel);
+    return row;
+}
+
+/** Loads every second element of an input row (convolution stride 2).
+ *
+ * Reads 16 contiguous elements and keeps the ones at even positions (0, 2, ..., 14).
+ *
+ * @param[in] input_pixel Pointer to the first element to read.
+ *
+ * @return the 8 selected elements.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 16) row = vload16(0, input_pixel);
+    return row.even;
+}
+
+/** Loads every third element of an input row (convolution stride 3), variant selected for -DDATA_SIZE=32.
+ *
+ * Four 4-wide loads are issued at element offsets 0, 6, 12 and 18; components 0 and 3 of each load
+ * give the elements at positions 0, 3, 6, ..., 21.
+ *
+ * @param[in] input_pixel Pointer to the first element to read.
+ *
+ * @return the 8 selected elements.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 4) q0 = vload4(0, input_pixel);
+    const VEC_DATA_TYPE(DATA_TYPE, 4) q1 = vload4(0, input_pixel + 6);
+    const VEC_DATA_TYPE(DATA_TYPE, 4) q2 = vload4(0, input_pixel + 12);
+    const VEC_DATA_TYPE(DATA_TYPE, 4) q3 = vload4(0, input_pixel + 18);
+
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(q0.s0, q0.s3, q1.s0, q1.s3, q2.s0, q2.s3, q3.s0, q3.s3);
+}
+
+/** Loads every third element of an input row (convolution stride 3), variant selected for -DDATA_SIZE=16.
+ *
+ * Three 8-wide loads cover elements 0..23; the elements at positions 0, 3, 6, ..., 21 are picked from them.
+ *
+ * @param[in] input_pixel Pointer to the first element to read.
+ *
+ * @return the 8 selected elements.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 8) low  = vload8(0, input_pixel);      // elements  0..7
+    const VEC_DATA_TYPE(DATA_TYPE, 8) mid  = vload8(0, input_pixel + 8);  // elements  8..15
+    const VEC_DATA_TYPE(DATA_TYPE, 8) high = vload8(0, input_pixel + 16); // elements 16..23
+
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(low.s0, low.s3, low.s6, mid.s1, mid.s4, mid.s7, high.s2, high.s5);
+}
+
+/** Loads every third element of an input row (convolution stride 3), variant selected for -DDATA_SIZE=8.
+ *
+ * Two 16-wide loads are issued at element offsets 0 and 12; components 0, 3, 6 and 9 of each load
+ * give the elements at positions 0, 3, 6, ..., 21.
+ *
+ * @param[in] input_pixel Pointer to the first element to read.
+ *
+ * @return the 8 selected elements.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 16) head = vload16(0, input_pixel);
+    const VEC_DATA_TYPE(DATA_TYPE, 16) tail = vload16(0, input_pixel + 12);
+
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(head.s0, head.s3, head.s6, head.s9, tail.s0, tail.s3, tail.s6, tail.s9);
+}
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // The third global id selects the filter (4th dimension of the weights) and the bias entry
+    const uint filter_index = get_global_id(2);
+
+    // Move to the first weight of the selected filter
+    weights.ptr += filter_index * weights_stride_w;
+
+    // Accumulator for the 8 output elements produced by this work-item.
+    // NOTE(review): DATA_TYPE_PROMOTED is not defined in this file; presumably it is supplied as a build option - confirm.
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    acc = 0;
+
+    // Walk the weights depth: one weight and one source plane per iteration
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        const DATA_TYPE w = *(__global DATA_TYPE *)weights.ptr;
+        const VEC_DATA_TYPE(DATA_TYPE, 8) pixels = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
+
+        acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))w, pixels));
+
+        // Advance both tensors by one Z plane
+        weights.ptr += weights_stride_z;
+        src.ptr += src_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    // Add the bias of the selected filter to every lane
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    const DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, filter_index)));
+    acc = ADD_OP(acc, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8))bias);
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(CONVERT_SAT(acc, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+// Accumulates src * weight_value into the four components of acc, one mad() per component.
+// weight_value is a scalar shared by all components. The macro expands to a ({ ... }) statement expression
+// and evaluates its arguments several times, so pass plain variables rather than expressions with side effects.
+#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
+ ({ \
+ acc.s0 = mad(src.s0, weight_value, acc.s0); \
+ acc.s1 = mad(src.s1, weight_value, acc.s1); \
+ acc.s2 = mad(src.s2, weight_value, acc.s2); \
+ acc.s3 = mad(src.s3, weight_value, acc.s3); \
+ })
+
+/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1_f32_bifrost(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    // The third global id selects the filter and the bias entry
+    const int filter_index = get_global_id(2);
+
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // One accumulator per output row: each work-item produces a tile of 4 elements x 4 rows
+    float4 row0_acc = 0.0f;
+    float4 row1_acc = 0.0f;
+    float4 row2_acc = 0.0f;
+    float4 row3_acc = 0.0f;
+
+    // Byte cursors: first weight of the selected filter and first element of the source tile
+    __global uchar *w_ptr  = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + filter_index * weights_stride_w);
+    __global uchar *in_ptr = (__global uchar *)offset(&src, 0, 0);
+
+    for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+    {
+        // The single 1x1 weight for this depth slice
+        const float w = *((__global float *)w_ptr);
+
+        // Four consecutive input rows, four elements each
+        const float4 in0 = vload4(0, (__global float *)(in_ptr + 0 * src_stride_y));
+        const float4 in1 = vload4(0, (__global float *)(in_ptr + 1 * src_stride_y));
+        const float4 in2 = vload4(0, (__global float *)(in_ptr + 2 * src_stride_y));
+        const float4 in3 = vload4(0, (__global float *)(in_ptr + 3 * src_stride_y));
+
+        CONVOLUTION1x1_BIFROST(row0_acc, in0, w);
+        CONVOLUTION1x1_BIFROST(row1_acc, in1, w);
+        CONVOLUTION1x1_BIFROST(row2_acc, in2, w);
+        CONVOLUTION1x1_BIFROST(row3_acc, in3, w);
+
+        // Advance both tensors by one Z plane
+        in_ptr += src_stride_z;
+        w_ptr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    // The same bias value is added to every element of the tile
+    const float4 bias = (float4)(*((__global float *)(vector_offset(&biases, filter_index))));
+
+    row0_acc += bias;
+    row1_acc += bias;
+    row2_acc += bias;
+    row3_acc += bias;
+#endif /* defined(HAS_BIAS) */
+
+    // Write the four output rows
+    vstore4(row0_acc, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(row1_acc, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(row2_acc, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+    vstore4(row3_acc, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
new file mode 100644
index 0000000000..811df053c4
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// Drop any earlier definition of CONVERT_SAT (presumably the one coming from helpers.h) so it can be redefined below
+#undef CONVERT_SAT
+
+// Arithmetic helpers used by the kernel: plain addition and multiplication.
+// CONVERT_SAT is a pass-through here: it returns its first argument and ignores the target type.
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+// CONVOLUTION1x3 forwards to the stride-specific row convolution; any STRIDE_X other than 1 or 2 is a build error
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+// 1x3 row convolution with stride 1, producing 8 outputs.
+// Loads the 3 weights of the row and 10 consecutive source elements (8 + 2), then accumulates into acc:
+// elements 0..7 times weight 0, elements 1..8 times weight 1 and elements 2..9 times weight 2.
+// src_row_ptr and weights_row_ptr are expanded more than once, so they must be free of side effects.
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ weights_values0 = vload3(0, weights_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ src0 = vload8(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ src1 = vload2(0, src_row_ptr + 8); \
+ \
+ acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+ })
+
+// 1x3 row convolution with stride 2, producing 8 outputs.
+// Loads the 3 weights of the row and 17 consecutive source elements (16 + 1), then accumulates into acc:
+// elements 0, 2, ..., 14 times weight 0, elements 1, 3, ..., 15 times weight 1 and
+// elements 2, 4, ..., 16 times weight 2.
+// src_row_ptr and weights_row_ptr are expanded more than once, so they must be free of side effects.
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ weights_values0 = vload3(0, weights_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 16) \
+ src0 = vload16(0, src_row_ptr); \
+ DATA_TYPE src1 = *(src_row_ptr + 16); \
+ \
+ acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+ })
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note This OpenCL kernel works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // The third global id selects the filter (4th dimension of the weights) and the bias entry
+    const int filter_index = get_global_id(2);
+
+    // Byte cursors: first weight of the selected filter and first element of the source window
+    __global uchar *w_ptr  = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0) + filter_index * weights_stride_w;
+    __global uchar *in_ptr = (__global uchar *)offset(&src, 0, 0);
+
+    // Accumulator for the 8 output elements produced by this work-item.
+    // NOTE(review): DATA_TYPE_PROMOTED is not defined in this file; presumably it is supplied as a build option - confirm.
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    acc = 0;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        // One 1x3 pass per filter row, from the top row to the bottom row
+        CONVOLUTION1x3(acc, (__global DATA_TYPE *)(in_ptr + 0 * src_stride_y), (__global DATA_TYPE *)(w_ptr + 0 * weights_stride_y));
+        CONVOLUTION1x3(acc, (__global DATA_TYPE *)(in_ptr + 1 * src_stride_y), (__global DATA_TYPE *)(w_ptr + 1 * weights_stride_y));
+        CONVOLUTION1x3(acc, (__global DATA_TYPE *)(in_ptr + 2 * src_stride_y), (__global DATA_TYPE *)(w_ptr + 2 * weights_stride_y));
+
+        // Advance both tensors by one Z plane
+        in_ptr += src_stride_z;
+        w_ptr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    // Add the bias of the selected filter to every lane
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    const DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, filter_index)));
+    acc = ADD_OP(acc, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8))bias);
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(CONVERT_SAT(acc, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+// 1x3 row convolution with stride 1, producing 4 outputs with scalar mad() operations.
+// src0 holds source elements 0..3 and src1 the following elements 4..5; weights_row0 holds the 3 weights of the row.
+// Output i accumulates element i times weight 0, element i + 1 times weight 1 and element i + 2 times weight 2.
+// The macro evaluates its arguments several times, so pass plain variables.
+#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
+ ({ \
+ acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
+ acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
+ acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
+ acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
+ acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
+ acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
+ acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
+ acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
+ acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
+ })
+
+/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // The third global id selects the filter (offset by weights_stride_w) and the bias entry
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // One accumulator per output row: values0, values1 and values2 are stored to dst rows 0, 1 and 2
+ float4 values0 = 0;
+ float4 values1 = 0;
+ float4 values2 = 0;
+
+ // Byte cursors: first weight of the selected filter and first element of the source window
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor.
+ // Input row r feeds output row o through weights row (r - o), for every o with 0 <= r - o <= 2.
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the three rows of the 3x3 filter for this depth slice
+ float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+ // Each input row is read as 6 consecutive floats: src0 holds elements 0..3, src1 elements 4..5
+ float4 src0;
+ float2 src1;
+
+ // Load values from row0 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
+
+ // Accumulate: row0 only contributes to output row 0
+ CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);
+
+ // Load values from row1 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
+
+ // Accumulate: row1 contributes to output rows 0 and 1
+ CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
+ CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);
+
+ // Load values from row2 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
+
+ // Accumulate: row2 contributes to all three output rows
+ CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
+ CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
+ CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);
+
+ // Load values from row3 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
+
+ // Accumulate: row3 contributes to output rows 1 and 2
+ CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
+ CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);
+
+ // Load values from row4 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
+
+ // Accumulate: row4 only contributes to output row 2
+ CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);
+
+ // Advance both tensors by one Z plane
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ // The same bias value is added to every element of the 4x3 tile
+ float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ values0 += (float4)bias;
+ values1 += (float4)bias;
+ values2 += (float4)bias;
+#endif /* defined(HAS_BIAS) */
+
+ // Write the three output rows
+ vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
new file mode 100644
index 0000000000..59d668f0bf
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+/* Bind CONVOLUTION1x5 to the row macro that matches the compile-time horizontal stride; only STRIDE_X 1 and 2 are accepted */
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X is neither 1 nor 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+/** Accumulates one 1x5 filter row into 8 adjacent outputs for a horizontal stride of 1.
+ *
+ * Loads 5 weights and 12 consecutive source elements. Output lane i (i = 0..7) receives
+ * the sum over k = 0..4 of src[i + k] * weights[k].
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 5 weights of the row
+ */
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ weights_values0 = vload4(0, weights_row_ptr); \
+ DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ src0 = vload8(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ src1 = vload4(0, src_row_ptr + 8); \
+ \
+ acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+ })
+/** Accumulates one 1x5 filter row into 8 outputs for a horizontal stride of 2.
+ *
+ * Loads 5 weights and 20 consecutive source elements. Output lane i (i = 0..7) receives
+ * the sum over k = 0..4 of src[2 * i + k] * weights[k].
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 5 weights of the row
+ */
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ weights_values0 = vload4(0, weights_row_ptr); \
+ DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 16) \
+ src0 = vload16(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ src1 = vload4(0, src_row_ptr + 16); \
+ acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+ \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+ })
+
+/** This kernel performs a direct 5x5 convolution to convolve the low three dimensions.
+ *
+ * Each work-item accumulates 8 output values over WEIGHTS_DEPTH slices of the source and
+ * weights tensors, optionally adds the bias of the selected filter and stores the 8 values
+ * at the destination pointer.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X (only 1 and 2 are supported): e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ values0 = 0; // Accumulator for the 8 values stored by this work-item
+
+ __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ const int kernel_index = get_global_id(2);
+ weights_addr += kernel_index * weights_stride_w; // Move to the filter selected by the global id in dimension 2
+
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) // NOTE(review): the volatile counter presumably keeps the compiler from unrolling this loop - confirm
+ {
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+
+ src_addr += src_stride_z; // Next source slice along Z
+ weights_addr += weights_stride_z; // Next weights slice along Z
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))); // Broadcast the scalar bias of this filter to all 8 lanes
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+/** Accumulates one 1x5 filter row into 4 adjacent outputs using scalar mad() operations.
+ *
+ * Output lane i (i = 0..3) receives the sum over k = 0..4 of src0.s<i + k> * w[k], where
+ * w[0..3] are the components of weights_row00 and w[4] is weights_row01.
+ *
+ * @param[in,out] acc 4-element accumulator, updated in place
+ * @param[in] src0 8-element vector holding the source row (components s0..s7 are read)
+ * @param[in] weights_row00 4-element vector with the first four weights of the row
+ * @param[in] weights_row01 Scalar holding the fifth weight of the row
+ */
+#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
+ ({ \
+ acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
+ acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
+ acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
+ acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
+ acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
+ acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
+ acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
+ acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
+ acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
+ acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
+ acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
+ acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
+ acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
+ acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
+ acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
+ acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
+ acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
+ acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
+ acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
+ acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
+ })
+
+/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * Each work-item computes a block of 4x2 outputs (two rows of 4 values). For every weights slice it
+ * reads 6 input rows of 8 values and the 5 weights rows, keeping only two weights rows in registers at a time.
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 values0 = 0.0f; // Accumulator for output row 0
+ float4 values1 = 0.0f; // Accumulator for output row 1
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights from row0 and row1
+ float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
+ float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
+ float8 src0;
+
+ // Load values from row0 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
+
+ // Accumulate: input row0 * weights row0 -> output row0
+ CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
+
+ // Load values from row1 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
+
+ // Accumulate: input row1 * weights row1 -> output row0, input row1 * weights row0 -> output row1
+ CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
+ CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
+
+ // Load values from row2 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+ // Load weights from row2 into weights_row00/01 (weights row0 is not used again in this slice)
+ weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+ weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
+
+ // Accumulate: weights row2 -> output row0, weights row1 -> output row1
+ CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
+ CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
+
+ // Load values from row3 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ // Load weights from row3 into weights_row10/11 (weights row1 is not used again in this slice)
+ weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
+ weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
+
+ // Accumulate: weights row3 -> output row0, weights row2 -> output row1
+ CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
+ CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
+
+ // Load values from row4 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
+
+ // Load weights from row4 into weights_row00/01 (weights row2 is not used again in this slice)
+ weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
+ weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
+ // Accumulate: weights row4 -> output row0, weights row3 -> output row1
+ CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
+ CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
+
+ // Load values from row5 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
+
+ // Accumulate: input row5 * weights row4 -> output row1
+ CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ values0 += bias;
+ values1 += bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
new file mode 100644
index 0000000000..b80d4f587e
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#undef CONVERT_SAT_STR
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+/* Saturating conversion of x to an 8-element vector of `type`. CONVERT_SAT forwards to CONVERT_SAT_STR so that a macro argument such as DATA_TYPE is expanded before it is pasted into the convert_<type>8_sat name. */
+#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#if KERNEL_SIZE == 9
+/* Bind CONVOLUTION1x9 to the row macro that matches the compile-time horizontal stride; only STRIDE_X 1 and 2 are accepted */
+#if STRIDE_X == 1
+#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X is neither 1 nor 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+/** Accumulates one quantized 1x9 filter row into 8 adjacent outputs for a horizontal stride of 1.
+ *
+ * Loads 9 weights and 16 consecutive source elements and converts them to int. Output lane i (i = 0..7)
+ * receives the sum over k = 0..8 of (src[i + k] + INPUT_OFFSET) * (weights[k] + WEIGHTS_OFFSET).
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element int accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 9 weights of the row
+ */
+#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
+ int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
+ int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
+ acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
+ })
+/** Accumulates one quantized 1x9 filter row into 8 outputs for a horizontal stride of 2.
+ *
+ * Loads 9 weights and 24 consecutive source elements and converts them to int. Output lane i (i = 0..7)
+ * receives the sum over k = 0..8 of (src[2 * i + k] + INPUT_OFFSET) * (weights[k] + WEIGHTS_OFFSET).
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element int accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 9 weights of the row
+ */
+#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
+ int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
+ int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
+ int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \
+ acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
+ })
+
+#elif KERNEL_SIZE == 5
+/* Bind CONVOLUTION1x5 to the row macro that matches the compile-time horizontal stride; only STRIDE_X 1 and 2 are accepted */
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X is neither 1 nor 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+/** Accumulates one quantized 1x5 filter row into 8 adjacent outputs for a horizontal stride of 1.
+ *
+ * Loads 5 weights and 12 consecutive source elements and converts them to int. Output lane i (i = 0..7)
+ * receives the sum over k = 0..4 of (src[i + k] + INPUT_OFFSET) * (weights[k] + WEIGHTS_OFFSET).
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element int accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 5 weights of the row
+ */
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
+ int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
+ int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
+ int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \
+ acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
+ })
+/** Accumulates one quantized 1x5 filter row into 8 outputs for a horizontal stride of 2.
+ *
+ * Loads 5 weights and 20 consecutive source elements and converts them to int. Output lane i (i = 0..7)
+ * receives the sum over k = 0..4 of (src[2 * i + k] + INPUT_OFFSET) * (weights[k] + WEIGHTS_OFFSET).
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element int accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 5 weights of the row
+ */
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
+ int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
+ int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
+ int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \
+ acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \
+ })
+
+#elif KERNEL_SIZE == 3
+/* Bind CONVOLUTION1x3 to the row macro that matches the compile-time horizontal stride; only STRIDE_X 1 and 2 are accepted */
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X is neither 1 nor 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+/** Accumulates one quantized 1x3 filter row into 8 adjacent outputs for a horizontal stride of 1.
+ *
+ * Loads 3 weights and 10 consecutive source elements and converts them to int. Output lane i (i = 0..7)
+ * receives the sum over k = 0..2 of (src[i + k] + INPUT_OFFSET) * (weights[k] + WEIGHTS_OFFSET).
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element int accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 3 weights of the row
+ */
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
+ int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
+ int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \
+ acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
+ })
+/** Accumulates one quantized 1x3 filter row into 8 outputs for a horizontal stride of 2.
+ *
+ * Loads 3 weights and 17 consecutive source elements and converts them to int. Output lane i (i = 0..7)
+ * receives the sum over k = 0..2 of (src[2 * i + k] + INPUT_OFFSET) * (weights[k] + WEIGHTS_OFFSET).
+ *
+ * @note The pointer arguments are expanded more than once, so pass expressions without side effects.
+ *
+ * @param[in,out] acc 8-element int accumulator, updated in place
+ * @param[in] src_row_ptr Pointer to the first source element of the row
+ * @param[in] weights_row_ptr Pointer to the first of the 3 weights of the row
+ */
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
+ int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
+ int src1 = convert_int(*(src_row_ptr + 16)); \
+ acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \
+ acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \
+ })
+
+#elif KERNEL_SIZE == 1
+/* Bind INPUT_VALUE to the extract_input_stride<N> helper that matches the compile-time STRIDE_X; only strides 1, 2 and 3 are accepted */
+#if STRIDE_X == 3
+#define INPUT_VALUE extract_input_stride3
+#elif STRIDE_X == 2
+#define INPUT_VALUE extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_VALUE extract_input_stride1
+
+#else /* STRIDE_X is not 1, 2 or 3 */
+#error "Only support strides 1, 2 and 3"
+#endif /* STRIDE_X */
+
+/** Reads 8 consecutive elements of an input row (horizontal stride 1).
+ *
+ * @param[in] input_value Pointer to the first element to read.
+ *
+ * @return vector with the elements at offsets 0..7 from @p input_value.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 8) row = vload8(0, input_value);
+    return row;
+}
+
+/** Reads every second element of an input row (horizontal stride 2).
+ *
+ * Sixteen consecutive elements are loaded and the even-indexed ones are kept.
+ *
+ * @param[in] input_value Pointer to the first element to read.
+ *
+ * @return vector with the elements at offsets 0, 2, 4, ..., 14 from @p input_value.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 16) row = vload16(0, input_value);
+    // .even selects components 0, 2, 4, ..., 14 of the 16-element vector
+    return row.even;
+}
+
+/** Reads every third element of an input row (horizontal stride 3, 8-bit data size).
+ *
+ * Two 16-element loads are issued, at element offsets 0 and 12, and components 0, 3, 6, 9 of each are kept.
+ *
+ * @param[in] input_value Pointer to the first element to read.
+ *
+ * @return vector with the elements at offsets 0, 3, 6, ..., 21 from @p input_value.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value)
+{
+    const VEC_DATA_TYPE(DATA_TYPE, 16) head = vload16(0, input_value);
+    const VEC_DATA_TYPE(DATA_TYPE, 16) tail = vload16(0, input_value + 12);
+
+    // Offsets 0, 3, 6, 9 come from the first load; 12, 15, 18, 21 from the second
+    const VEC_DATA_TYPE(DATA_TYPE, 4) head_picks = head.s0369;
+    const VEC_DATA_TYPE(DATA_TYPE, 4) tail_picks = tail.s0369;
+
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(head_picks, tail_picks);
+}
+
+#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */
+#error "Only kernel sizes 1, 3, 5 and 9 are supported"
+#endif /* KERNEL_SIZE */
+
+/** This kernel performs a quantized direct convolution to convolve the low three dimensions.
+ *
+ * Each work-item accumulates 8 output values in int precision over WEIGHTS_DEPTH slices, optionally adds
+ * the bias of the selected filter, rescales the result with OUTPUT_MULTIPLIER / OUTPUT_SHIFT, adds
+ * OUTPUT_OFFSET and stores the 8 values saturated to DATA_TYPE.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE
+ * @note The kernel size must be passed at compile time using -DKERNEL_SIZE (only 1, 3, 5 and 9 are supported): e.g. -DKERNEL_SIZE=3
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
+ * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
+ * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
+ * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution_quantized(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int8 values0 = 0; // int accumulators for the 8 values stored by this work-item
+
+ __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0); // NOTE(review): byte strides are added to DATA_TYPE pointers below, which matches only if DATA_TYPE is one byte wide - confirm
+ __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0);
+
+ const int kernel_index = get_global_id(2);
+ weights_addr += kernel_index * weights_stride_w; // Move to the filter selected by the global id in dimension 2
+
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) // NOTE(review): the volatile counter presumably keeps the compiler from unrolling this loop - confirm
+ {
+#if KERNEL_SIZE == 9
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y));
+ CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y));
+#elif KERNEL_SIZE == 5
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+ CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+#elif KERNEL_SIZE == 3
+ CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+ CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+#elif KERNEL_SIZE == 1
+ int weight = convert_int(*(__global DATA_TYPE *)weights_addr);
+ int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr));
+ values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET);
+#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) || (KERNEL_SIZE == 9) */
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
+ values0 += (int8)(*bias_addr); // Broadcast the scalar bias of this filter to all 8 lanes
+#endif /* defined(HAS_BIAS) */
+
+#if OUTPUT_SHIFT < 0 // The sign of OUTPUT_SHIFT selects which requantization helper is used
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_SHIFT < 0
+ values0 = values0 + OUTPUT_OFFSET;
+
+ vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr); // Saturate to DATA_TYPE and store the 8 outputs
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
diff --git a/src/core/CL/cl_kernels/nchw/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl
new file mode 100644
index 0000000000..fddf918c63
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/im2col.cl
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+// COND_DATA_TYPE is the integer type used for the masks handed to select(); it is chosen from ELEMENT_SIZE (presumably sizeof(DATA_TYPE) in bytes - confirm against the host code)
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This OpenCL kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH: e.g. -DSRC_WIDTH=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col1x1_stridex1_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const uint xc    = get_global_id(0) * 4;         // first of the 4 x coordinates (in the convolved tensor) handled by this work-item
+    const uint yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const uint ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const uint batch = get_global_id(2) / SRC_DEPTH; // batch index
+
+    // Each work-item handles the 4 consecutive x positions xc, xc + 1, xc + 2 and xc + 3.
+    // Positions that fall outside the row are clamped to xc, which is expected to always be a valid position.
+    uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
+
+    // Check which values are valid (x < SRC_WIDTH)
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+
+    xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
+
+    // Calculate input indices
+    const uint xi = xc;
+    const uint yi = yc * STRIDE_Y;
+
+    // Calculate output indices: one output row per convolved position, one column per input channel
+#if defined(NUM_GROUPS)
+    const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
+    const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else  // defined(NUM_GROUPS)
+    const uint xo = ch;
+#endif // defined(NUM_GROUPS)
+    const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    VEC_DATA_TYPE(DATA_TYPE, 4) data = vload4(0, (__global DATA_TYPE *)input_ptr);
+
+    // Out-of-bound lanes take the first element: their row index was clamped to xc above, so they rewrite data.s0 to lane 0's row
+    data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
+
+    *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
+    *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
+    *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
+    *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        // Only the last channel appends the constant 1, in the element right after its own value on each of the 4 rows
+        *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#if defined(DILATION_X) && defined(DILATION_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT)
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in place of elements that fall out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch index
+
+    // Calculate input indices: top-left corner of the patch, negative inside the padded border
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices: each channel owns KERNEL_WIDTH * KERNEL_HEIGHT elements of its output row
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else  // defined(NUM_GROUPS)
+    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; // origin of the source plane, patch offsets are applied per element below
+#if defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else  // defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
+    // Linearize convolution elements: the patch is copied row by row into consecutive output elements
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+    {
+        int y = yi + yk * DILATION_Y;
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
+        {
+            int x = xi + xk * DILATION_X;
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+            *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+            {
+                *output_ptr = PAD_VALUE; // outside the source plane: nothing is read from memory
+            }
+            else
+            {
+                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            }
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+        }
+    }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *output_ptr = 1.0f; // the loops left output_ptr one element past the last patch value of the last channel
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(DILATION_X) && defined(DILATION_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT)
+
+/** This OpenCL kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in place of elements that fall out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Position of this work-item in the convolved tensor, plus the feature map and batch it handles
+    const int conv_x    = get_global_id(0);
+    const int conv_y    = get_global_id(1);
+    const int channel   = get_global_id(2) % SRC_DEPTH;
+    const int batch_idx = get_global_id(2) / SRC_DEPTH;
+
+    // Top-left corner of the 3x3 patch in the source plane (negative inside the padded border)
+    const int in_x = conv_x * STRIDE_X - PAD_LEFT;
+    const int in_y = conv_y * STRIDE_Y - PAD_TOP;
+
+    // Destination coordinates: every channel owns 9 elements of its output row
+#if defined(NUM_GROUPS)
+    const int out_col   = (channel % (SRC_DEPTH / NUM_GROUPS)) * 9;
+    const int group_idx = channel / (SRC_DEPTH / NUM_GROUPS);
+#else  // defined(NUM_GROUPS)
+    const int out_col = channel * 9;
+#endif // defined(NUM_GROUPS)
+    const int out_row = conv_x + conv_y * CONVOLVED_WIDTH; // index of the convolution window
+
+    // Byte addresses of the patch origin and of the first element to write
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + in_x * (int)src_stride_x + in_y * (int)src_stride_y + channel * src_stride_z + batch_idx * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + group_idx * dst_stride_z + batch_idx * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + batch_idx * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    // Load the three patch rows unconditionally; out-of-plane values are masked afterwards when padding is enabled
+    VEC_DATA_TYPE(DATA_TYPE, 3) row_top = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3) row_mid = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3) row_bot = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Substitute PAD_VALUE for every element whose column or row lies outside the source plane
+    const int3 xs      = (int3)(in_x, in_x + 1, in_x + 2);
+    const int3 x_valid = (xs >= (int3)0) && (xs < (int3)SRC_WIDTH);
+
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 3) keep_top = CONVERT((x_valid && (int3)(in_y >= 0 && in_y < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 3) keep_mid = CONVERT((x_valid && (int3)((in_y + 1) >= 0 && (in_y + 1) < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 3) keep_bot = CONVERT((x_valid && (int3)((in_y + 2) >= 0 && (in_y + 2) < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+
+    row_top = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row_top, keep_top);
+    row_mid = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row_mid, keep_mid);
+    row_bot = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row_bot, keep_bot);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+    // Write the 9 values back to back: one 8-wide store followed by one scalar store
+    __global DATA_TYPE *dst_elems = (__global DATA_TYPE *)output_ptr;
+    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row_top.s012, row_mid.s012, row_bot.s01), 0, dst_elems);
+    dst_elems[8] = row_bot.s2;
+
+#ifdef HAS_BIAS
+    // The last channel (of each group when NUM_GROUPS is set) also writes 1 in the element right after its 9 values
+#if defined(NUM_GROUPS)
+    if((out_col / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(channel == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        dst_elems[9] = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+/** This OpenCL kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
+ *
+ * Each work-item copies one 5x5 patch of a single input channel into 25 destination elements, handled as
+ * rows 0-1 (10 values), rows 2-3 (10 values) and row 4 (5 values). When any padding is configured, values that
+ * fall outside the source plane are replaced with PAD_VALUE.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in place of elements that fall out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col5x5_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Position of this work-item in the convolved tensor, plus the feature map and batch it handles
+    const int conv_x    = get_global_id(0);
+    const int conv_y    = get_global_id(1);
+    const int channel   = get_global_id(2) % SRC_DEPTH;
+    const int batch_idx = get_global_id(2) / SRC_DEPTH;
+
+    // Top-left corner of the 5x5 patch in the source plane (negative inside the padded border)
+    const int in_x = conv_x * STRIDE_X - PAD_LEFT;
+    const int in_y = conv_y * STRIDE_Y - PAD_TOP;
+
+    // Destination coordinates: every channel owns 25 elements of its output row
+#if defined(NUM_GROUPS)
+    const int out_col   = (channel % (SRC_DEPTH / NUM_GROUPS)) * 25;
+    const int group_idx = channel / (SRC_DEPTH / NUM_GROUPS);
+#else  // defined(NUM_GROUPS)
+    const int out_col = channel * 25;
+#endif // defined(NUM_GROUPS)
+    const int out_row = conv_x + conv_y * CONVOLVED_WIDTH; // index of the convolution window
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Patch columns/rows 0..3 are tracked as 4-wide vectors, column/row 4 as scalars
+    const int4 cols03 = (int4)(in_x, in_x + 1, in_x + 2, in_x + 3);
+    const int4 rows03 = (int4)(in_y, in_y + 1, in_y + 2, in_y + 3);
+    const int  col4   = in_x + 4;
+    const int  row4   = in_y + 4;
+
+    // In-bounds flag of every column and of every row on its own; the per-element masks below combine them
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cols03_ok = CONVERT((cols03 >= (int4)0 && cols03 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) rows03_ok = CONVERT((rows03 >= (int4)0 && rows03 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+    const COND_DATA_TYPE col4_ok = (COND_DATA_TYPE)(col4 >= 0 && col4 < SRC_WIDTH);
+    const COND_DATA_TYPE row4_ok = (COND_DATA_TYPE)(row4 >= 0 && row4 < SRC_HEIGHT);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+    // Byte addresses of the patch origin and of the first element to write
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + in_x * (int)src_stride_x + in_y * (int)src_stride_y + channel * src_stride_z + batch_idx * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + group_idx * dst_stride_z + batch_idx * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + batch_idx * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+
+    // Patch rows 0 and 1 -> values 0..9 of this channel
+    {
+        __global DATA_TYPE *src_row = (__global DATA_TYPE *)input_ptr;
+        VEC_DATA_TYPE(DATA_TYPE, 4) upper4 = vload4(0, src_row);
+        DATA_TYPE upper_tail = src_row[4];
+
+        input_ptr += src_stride_y;
+        src_row = (__global DATA_TYPE *)input_ptr;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4) lower4 = vload4(0, src_row);
+        DATA_TYPE lower_tail = src_row[4];
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        // A value is kept only when both its column and its row are inside the source plane
+        const VEC_DATA_TYPE(COND_DATA_TYPE, 4) upper4_ok = cols03_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows03_ok.s0;
+        const VEC_DATA_TYPE(COND_DATA_TYPE, 4) lower4_ok = cols03_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows03_ok.s1;
+        const COND_DATA_TYPE upper_tail_ok = (COND_DATA_TYPE)(col4_ok && rows03_ok.s0);
+        const COND_DATA_TYPE lower_tail_ok = (COND_DATA_TYPE)(col4_ok && rows03_ok.s1);
+
+        // Everything else is replaced with PAD_VALUE
+        upper4     = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, upper4, upper4_ok);
+        lower4     = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, lower4, lower4_ok);
+        upper_tail = select((DATA_TYPE)PAD_VALUE, upper_tail, upper_tail_ok);
+        lower_tail = select((DATA_TYPE)PAD_VALUE, lower_tail, lower_tail_ok);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        // 10 values = one 8-wide store (first row plus 3 values of the second) followed by one 2-wide store
+        __global DATA_TYPE *dst_elems = (__global DATA_TYPE *)output_ptr;
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(upper4.s0123, upper_tail, lower4.s012), 0, dst_elems);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(lower4.s3, lower_tail), 0, dst_elems + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    // Patch rows 2 and 3 -> values 10..19 of this channel
+    {
+        __global DATA_TYPE *src_row = (__global DATA_TYPE *)input_ptr;
+        VEC_DATA_TYPE(DATA_TYPE, 4) upper4 = vload4(0, src_row);
+        DATA_TYPE upper_tail = src_row[4];
+
+        input_ptr += src_stride_y;
+        src_row = (__global DATA_TYPE *)input_ptr;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4) lower4 = vload4(0, src_row);
+        DATA_TYPE lower_tail = src_row[4];
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        // Same masking as above, using the row flags of patch rows 2 and 3
+        const VEC_DATA_TYPE(COND_DATA_TYPE, 4) upper4_ok = cols03_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows03_ok.s2;
+        const VEC_DATA_TYPE(COND_DATA_TYPE, 4) lower4_ok = cols03_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows03_ok.s3;
+        const COND_DATA_TYPE upper_tail_ok = (COND_DATA_TYPE)(col4_ok && rows03_ok.s2);
+        const COND_DATA_TYPE lower_tail_ok = (COND_DATA_TYPE)(col4_ok && rows03_ok.s3);
+
+        // Everything else is replaced with PAD_VALUE
+        upper4     = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, upper4, upper4_ok);
+        lower4     = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, lower4, lower4_ok);
+        upper_tail = select((DATA_TYPE)PAD_VALUE, upper_tail, upper_tail_ok);
+        lower_tail = select((DATA_TYPE)PAD_VALUE, lower_tail, lower_tail_ok);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        // 10 values = one 8-wide store (first row plus 3 values of the second) followed by one 2-wide store
+        __global DATA_TYPE *dst_elems = (__global DATA_TYPE *)output_ptr;
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(upper4.s0123, upper_tail, lower4.s012), 0, dst_elems);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(lower4.s3, lower_tail), 0, dst_elems + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    // Patch row 4 -> values 20..24 of this channel
+    {
+        __global DATA_TYPE *src_row = (__global DATA_TYPE *)input_ptr;
+        VEC_DATA_TYPE(DATA_TYPE, 4) last4 = vload4(0, src_row);
+        DATA_TYPE last_tail = src_row[4];
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        const VEC_DATA_TYPE(COND_DATA_TYPE, 4) last4_ok = cols03_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))row4_ok;
+        const COND_DATA_TYPE last_tail_ok = (COND_DATA_TYPE)(col4_ok && row4_ok);
+
+        // Everything outside the source plane is replaced with PAD_VALUE
+        last4     = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, last4, last4_ok);
+        last_tail = select((DATA_TYPE)PAD_VALUE, last_tail, last_tail_ok);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        // 5 values = one 4-wide store followed by one scalar store
+        __global DATA_TYPE *dst_elems = (__global DATA_TYPE *)output_ptr;
+        vstore4(last4, 0, dst_elems);
+        dst_elems[4] = last_tail;
+
+        // Step over the 5 values so that output_ptr addresses the slot right after the 25 patch values
+        output_ptr += 5 * dst_stride_x;
+    }
+
+#ifdef HAS_BIAS
+    // The last channel (of each group when NUM_GROUPS is set) also writes 1 in that trailing slot
+#if defined(NUM_GROUPS)
+    if((out_col / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(channel == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This OpenCL kernel performs im2col when the kernel size is 11x11, no padding is applied and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col11x11_padx0_pady0_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch index
+    // Each work-item copies one 11x11 input patch of channel `ch` into 121 consecutive destination elements.
+    // Calculate input indices
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else  // defined(NUM_GROUPS)
+    const int xo = ch * 121; // 11x11
+#endif // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
+#else  // defined(NUM_GROUPS)
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+#endif // defined(NUM_GROUPS)
+    // Patch row 0: the 11 elements are moved as one 8-wide and one 3-wide vector
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x; // output_ptr addresses dst, so step with the dst stride
+    }
+    // Patch row 1
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 2
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 3
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 4
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 5
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 6
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 7
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 8
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 9
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 11 * dst_stride_x;
+    }
+    // Patch row 10 (last row: the input pointer is not advanced any further)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
+        vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        output_ptr += 11 * dst_stride_x;
+    }
+    // output_ptr now sits just past the copied patch; the last channel (of its group) writes the bias 1 there
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_padx0_pady0_nchw(
+    TENSOR3D_DECLARATION(src),
+#if defined(NUM_GROUPS)
+    TENSOR3D_DECLARATION(dst),
+#else  // defined(NUM_GROUPS)
+    IMAGE_DECLARATION(dst),
+#endif // defined(NUM_GROUPS)
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
+    const int batch = get_global_id(2) / SRC_DEPTH; // batch index
+    // Each work-item copies one KERNEL_WIDTH x KERNEL_HEIGHT input patch of channel `ch` row by row into dst.
+    // Calculate input indices
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+#if defined(NUM_GROUPS)
+    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
+#else  // defined(NUM_GROUPS)
+    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+#endif // defined(NUM_GROUPS)
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+    // input_ptr is the start of the channel plane; x/y are applied per load. output_ptr is typed and starts at element xo.
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+#if defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
+#else  // defined(NUM_GROUPS)
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+#endif // defined(NUM_GROUPS)
+
+    // Linearize convolution elements
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        int       x   = xi; // kept outside the loop so the tail copy starts exactly where the vector loop stopped
+        const int x_e = xi + KERNEL_WIDTH;
+        for(; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
+        {
+            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            VSTORE(VECTOR_SIZE)
+            (row, 0, output_ptr);
+        }
+        // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
+        // x and output_ptr point just past the last full vector, or at the row start (xi) if no full vector fitted.
+#if WIDTH_MOD_VECTOR_SIZE == 1
+        *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+        VSTORE(WIDTH_MOD_VECTOR_SIZE)
+        (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+        output_ptr += WIDTH_MOD_VECTOR_SIZE;
+    } /* End of loop over KERNEL_HEIGHT */
+    // output_ptr now sits just past the copied patch; the last channel (of its group) writes the bias 1 there
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else  // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *output_ptr = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
new file mode 100644
index 0000000000..0fef98e295
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y)) // product of the two operands
+#define ADD_OP(x, y) ((x) + (y)) // sum of the two operands
+#define DIV_OP(x, y) ((x) / (y)) // quotient x / y
+#define POW_OP(x, y) pow((x), (y)) // x raised to the power y via the pow() built-in
+#define SQCVT_SAT(a) (a) // expands to its argument unchanged; no conversion is performed here
+
+#if defined(NUM_SLICES)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
+                                                 TENSOR3D_DECLARATION(output))
+{
+    Tensor3D src_view = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D dst_view = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Normalization constants, broadcast to every vector lane
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    scale = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    exponent = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    bias = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    // Slice offsets in [-RADIUS, RADIUS], clamped so that slice + offset stays within [0, NUM_SLICES - 1]
+    const int slice        = get_global_id(2);
+    const int first_offset = max(-(int)RADIUS, -slice);
+    const int last_offset  = min((int)RADIUS, (int)NUM_SLICES - 1 - slice);
+
+    // Sum of squared values over the slice window
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) sum_sq = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    for(int dz = first_offset; dz <= last_offset; dz++)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) v = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src_view, 0, 0, dz));
+        sum_sq = ADD_OP(sum_sq, MUL_OP(v, v));
+    }
+
+    // denominator = (sum_sq * COEFF + KAPPA) ^ BETA
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denom = POW_OP(ADD_OP(MUL_OP(sum_sq, scale), bias), exponent);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) centre = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_view.ptr);
+
+    VSTORE(VEC_SIZE)
+    (DIV_OP(centre, denom), 0, (__global DATA_TYPE *)dst_view.ptr);
+}
+#endif /* defined(NUM_SLICES) */
+
+#if defined(WIDTH_SIZE)
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    // Per work-item views of the source and destination tensors
+    Tensor3D src_view = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D dst_view = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Normalization constants (scalar macros assigned to vector variables)
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    scale = SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    exponent = SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    bias = SQCVT_SAT(KAPPA);
+
+    // Column offsets around column get_global_id(0) * 4; the left bound may reach 3 columns before column 0
+    const int col      = get_global_id(0) << 2;
+    const int col_from = max(-(int)RADIUS, -3 - col);
+    const int col_to   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - col);
+
+    // Row offsets: clamped to the rows of the launch grid with IN_MAP_2D, only the current row otherwise
+#if defined(IN_MAP_2D)
+    const int row      = get_global_id(1);
+    const int row_from = max(-(int)RADIUS, -row);
+    const int row_to   = min((int)RADIUS, (int)get_global_size(1) - 1 - row);
+#else  /* defined(IN_MAP_2D) */
+    const int row_from = 0;
+    const int row_to   = 0;
+#endif /* defined(IN_MAP_2D) */
+
+    // Sum of squared values over the window
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    sum_sq = 0;
+    for(int dy = row_from; dy <= row_to; ++dy)
+    {
+        for(int dx = col_from; dx <= col_to; ++dx)
+        {
+            const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            v = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src_view, dx, dy, 0));
+            sum_sq = ADD_OP(sum_sq, MUL_OP(v, v));
+        }
+    }
+
+    // denominator = (sum_sq * COEFF + KAPPA) ^ BETA
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denom = POW_OP(ADD_OP(MUL_OP(sum_sq, scale), bias), exponent);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    centre = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_view.ptr);
+
+    // Store the input values divided by the denominator
+    VSTORE(VEC_SIZE)
+    (DIV_OP(centre, denom), 0, (__global DATA_TYPE *)dst_view.ptr);
+}
+#endif // defined(WIDTH_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..23a0de76f7
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) // vector type of VEC_SIZE elements of DATA_TYPE
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+                                              TENSOR3D_DECLARATION(dst),
+                                              VECTOR_DECLARATION(mean),
+                                              VECTOR_DECLARATION(std))
+{
+    Tensor3D in_view   = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D out_view  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean_view = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std_view  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    // Channel index: the Z work-item id wrapped by NUM_CHANNELS
+    const uint channel = get_global_id(2) % NUM_CHANNELS;
+
+    // Mean and std of this channel, read at offset channel * sizeof(DATA_TYPE) from each vector pointer
+    const DATA_TYPE channel_mean = *((__global DATA_TYPE *)(mean_view.ptr + channel * sizeof(DATA_TYPE)));
+    const DATA_TYPE channel_std  = *((__global DATA_TYPE *)(std_view.ptr + channel * sizeof(DATA_TYPE)));
+
+    const TYPE normalized = (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in_view.ptr) - channel_mean) / channel_std;
+    VSTORE(VEC_SIZE)
+    (normalized, 0, (__global DATA_TYPE *)out_view.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..0f02ef6184
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) // vector type of VEC_SIZE elements of DATA_TYPE
+#define OFFSET_FLT ((float)OFFSET) // the OFFSET build option cast to float
+#define SCALE_FLT ((float)SCALE) // the SCALE build option cast to float
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+                                                 TENSOR3D_DECLARATION(dst),
+                                                 VECTOR_DECLARATION(mean),
+                                                 VECTOR_DECLARATION(std))
+{
+    Tensor3D in_view   = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D out_view  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean_view = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std_view  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    // Channel index: the Z work-item id wrapped by NUM_CHANNELS
+    const uint channel = get_global_id(2) % NUM_CHANNELS;
+
+    // Stored mean, std and data values widened to float vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE) mean_q = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean_view.ptr + channel * sizeof(DATA_TYPE))));
+    const VEC_DATA_TYPE(float, VEC_SIZE) std_q  = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std_view.ptr + channel * sizeof(DATA_TYPE))));
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    data_q = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in_view.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // round(q - OFFSET) * SCALE applied to each of them
+    const VEC_DATA_TYPE(float, VEC_SIZE) mean_f = round(mean_q - OFFSET_FLT) * SCALE_FLT;
+    const VEC_DATA_TYPE(float, VEC_SIZE) std_f  = round(std_q - OFFSET_FLT) * SCALE_FLT;
+    const VEC_DATA_TYPE(float, VEC_SIZE) data_f = round(data_q - OFFSET_FLT) * SCALE_FLT;
+
+    // Perform normalization in float
+    const VEC_DATA_TYPE(float, VEC_SIZE) norm_f = (data_f - mean_f) / std_f;
+
+    // Map back with round(x / SCALE) + OFFSET, saturating on conversion to TYPE
+    const TYPE requantized = CONVERT_SAT(round(norm_f / SCALE_FLT) + OFFSET_FLT, TYPE);
+    VSTORE(VEC_SIZE)
+    (requantized, 0, (__global DATA_TYPE *)out_view.ptr);
+}
+
+#endif // defined(NUM_CHANNELS)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
new file mode 100644
index 0000000000..790ddb381a
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+// Binary reduction operator: sum for average/L2 pooling, maximum otherwise
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+// Squares the operand for L2 pooling, identity otherwise (vec_size is not used by either form)
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+// Division expressed as a multiplication by the reciprocal.
+// Both arguments are parenthesized so that compound expressions expand with the intended precedence.
+#define DIV_OP(x, y) ((x) * (1.f / (y)))
+#define SQRT_OP(x) sqrt((x))
+
+// Vector load helper: with FP_MIXED_PRECISION the loaded values are converted to ACC_DATA_TYPE,
+// otherwise they are returned as loaded
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+    CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+/** Number of elements covered by the pooling window of the current workitem.
+ *
+ * The window starts at (global_id * stride - pad) and is clipped to the given upper bounds.
+ * When EXCLUDE_PADDING is defined the window start is additionally clamped to 0.
+ *
+ * @return (window height) * (window width), converted to ACC_DATA_TYPE
+ */
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    // Unclipped window origin
+    const int origin_x = get_global_id(0) * stride_x - pad_x;
+    const int origin_y = get_global_id(1) * stride_y - pad_y;
+
+    // Window end, clipped to the upper bounds
+    const int end_x = min(origin_x + pool_size_x, upper_bound_w);
+    const int end_y = min(origin_y + pool_size_y, upper_bound_h);
+
+#if defined(EXCLUDE_PADDING)
+    const int begin_x = max(0, origin_x);
+    const int begin_y = max(0, origin_y);
+#else  /* defined(EXCLUDE_PADDING) */
+    const int begin_x = origin_x;
+    const int begin_y = origin_y;
+#endif /* defined(EXCLUDE_PADDING) */
+
+    return ((end_y - begin_y) * (end_x - begin_x));
+}
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Pools a POOL_SIZE_X x POOL_SIZE_Y window of an NCHW tensor into one output element per workitem.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note The accumulation type is taken from ACC_DATA_TYPE;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note With -DPOOL_L2 the inputs are squared, averaged and the square root of the average is stored
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Per-workitem views of the source and destination tensors
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Accumulators: 8-wide for the vectorized part of each row, scalar for the row tail
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+    acc_vec                = INITIAL_VALUE;
+    ACC_DATA_TYPE acc_tail = INITIAL_VALUE;
+
+    for(int row = 0; row < POOL_SIZE_Y; ++row)
+    {
+        int col = 0;
+
+        // Consume the row eight elements at a time
+        while(col <= ((int)POOL_SIZE_X - 8))
+        {
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            values = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&src, col, row, 0));
+            // Squared for L2 pooling, unchanged otherwise
+            values  = POW2_OP(values, 8);
+            acc_vec = POOL_OP(acc_vec, values);
+            col += 8;
+        }
+
+        // Remaining elements of the row, one at a time
+        while(col < (int)POOL_SIZE_X)
+        {
+            ACC_DATA_TYPE value = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&src, col, row, 0)));
+            // Squared for L2 pooling, unchanged otherwise
+            value    = POW2_OP(value, 1);
+            acc_tail = POOL_OP(acc_tail, value);
+            ++col;
+        }
+    }
+
+    // Fold the 8-wide accumulator down to a scalar, then merge the tail accumulator
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    acc4 = POOL_OP(acc_vec.s0123, acc_vec.s4567);
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    acc2                 = POOL_OP(acc4.s01, acc4.s23);
+    ACC_DATA_TYPE pooled = POOL_OP(acc2.s0, acc2.s1);
+    pooled               = POOL_OP(pooled, acc_tail);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Sum -> mean: divide by the size of the pooling region
+    pooled = DIV_OP(pooled, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // L2 pooling stores the square root of the mean of squares
+    pooled = SQRT_OP(pooled);
+#endif /* defined(POOL_L2) */
+
+    // Write the single output element of this workitem
+    *(__global DATA_TYPE *)dst.ptr = (DATA_TYPE)pooled;
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+/** Computes, for the current workitem, two element indices with the tensor padding contributions removed.
+ *
+ * The position is taken from the global ids: x and y are scaled by STRIDE_X / STRIDE_Y, z is used as is.
+ * The callers in this file use the results (and the results + 1) as the indices of the four window elements.
+ *
+ * @param[in]  input         Tensor whose strides and first-element offset are used for the computation
+ * @param[out] offset_top    Index (in elements of DATA_TYPE) for the top row of the window
+ * @param[out] offset_bottom Index (in elements of DATA_TYPE) for the row below, i.e. offset_top plus one row length without horizontal padding
+ */
+inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
+{
+ // Total padding, in elements, along each axis
+ const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+ const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
+
+ // Window origin of this workitem
+ const int x = get_global_id(0) * STRIDE_X;
+ const int y = get_global_id(1) * STRIDE_Y;
+ const int z = get_global_id(2);
+
+ //x axis: width, y axis: height, z axis: component
+ // Byte offset of (x, y, z) computed with the tensor strides
+ const uint padded_offset = input->offset_first_element_in_bytes
+ + x * input->stride_x
+ + y * input->stride_y
+ + z * input->stride_z;
+
+ // Same position with the byte contributions of the padding subtracted
+ const uint offset_base = padded_offset
+ - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
+ - PAD_TENSOR_TOP * input->stride_y /* top padding */
+ - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
+ - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+ // Bytes -> elements; when the tensor dimensions are known the index is wrapped to one
+ // TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT volume (presumably one batch -- TODO confirm)
+#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
+ *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
+#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+ *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
+#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+
+ // One row further: row stride in elements minus the horizontal padding
+ *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+ return;
+}
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+/** 2x2 MAX pooling on an F32 NCHW tensor that also records the index of the selected element.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM.
+ *       The indices tensor is only written when all four are defined.
+ * @note The reduction uses POOL_OP, which is the maximum only when neither -DPOOL_AVG nor -DPOOL_L2 is defined
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp32(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Per-workitem views of the three tensors
+    Tensor3D src     = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D idx_out = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // The two rows of the 2x2 window
+    float2 top_row    = VLOAD(2)(0, (__global float *)tensor3D_offset(&src, 0, 0, 0));
+    float2 bottom_row = VLOAD(2)(0, (__global float *)tensor3D_offset(&src, 0, 1, 0));
+
+    // Reduce each row, then the two row results
+    float top_max    = POOL_OP(top_row.s0, top_row.s1);
+    float bottom_max = POOL_OP(bottom_row.s0, bottom_row.s1);
+
+    *(__global float *)dst.ptr = POOL_OP(top_max, bottom_max);
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    // Indices of the first element of each row, computed without padding
+    uint top_first    = 0;
+    uint bottom_first = 0;
+    offset_no_padding_nchw(&src, &top_first, &bottom_first);
+
+    // Within a row the first element wins ties; between rows the top row wins ties
+    const uint top_idx    = isgreaterequal(top_row.s0, top_row.s1) ? top_first : top_first + 1;
+    const uint bottom_idx = isgreaterequal(bottom_row.s0, bottom_row.s1) ? bottom_first : bottom_first + 1;
+
+    *(__global uint *)idx_out.ptr = isgreaterequal(top_max, bottom_max) ? top_idx : bottom_idx;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** 2x2 MAX pooling on an F16 NCHW tensor that also records the index of the selected element.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM.
+ *       The indices tensor is only written when all four are defined.
+ * @note The reduction uses POOL_OP, which is the maximum only when neither -DPOOL_AVG nor -DPOOL_L2 is defined
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp16(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Per-workitem views of the three tensors
+    Tensor3D src     = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D idx_out = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // The two rows of the 2x2 window
+    half2 top_row    = VLOAD(2)(0, (__global half *)tensor3D_offset(&src, 0, 0, 0));
+    half2 bottom_row = VLOAD(2)(0, (__global half *)tensor3D_offset(&src, 0, 1, 0));
+
+    // Reduce each row, then the two row results
+    half top_max    = POOL_OP(top_row.s0, top_row.s1);
+    half bottom_max = POOL_OP(bottom_row.s0, bottom_row.s1);
+
+    *(__global half *)dst.ptr = POOL_OP(top_max, bottom_max);
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    // Indices of the first element of each row, computed without padding
+    uint top_first    = 0;
+    uint bottom_first = 0;
+    offset_no_padding_nchw(&src, &top_first, &bottom_first);
+
+    // Within a row the first element wins ties; between rows the top row wins ties
+    const uint top_idx    = isgreaterequal(top_row.s0, top_row.s1) ? top_first : top_first + 1;
+    const uint bottom_idx = isgreaterequal(bottom_row.s0, bottom_row.s1) ? bottom_first : bottom_first + 1;
+
+    *(__global uint *)idx_out.ptr = isgreaterequal(top_max, bottom_max) ? top_idx : bottom_idx;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..1440ef3ed1
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+// Vector of VEC_SIZE elements of the quantized data type
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+// Binary reduction operator: sum for average pooling, maximum otherwise
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+// Division expressed as a multiplication by the reciprocal.
+// Both arguments are parenthesized so that compound expressions expand with the intended precedence.
+#define DIV_OP(x, y) ((x) * (1.f / (y)))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+/** Number of elements covered by the pooling window of the current workitem.
+ *
+ * The window starts at (global_id * stride - pad) and is clipped to the given upper bounds.
+ * When EXCLUDE_PADDING is defined the window start is additionally clamped to 0.
+ *
+ * @return (window height) * (window width)
+ */
+int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                        const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    // Unclipped window origin
+    const int origin_x = get_global_id(0) * stride_x - pad_x;
+    const int origin_y = get_global_id(1) * stride_y - pad_y;
+
+    // Window end, clipped to the upper bounds
+    const int end_x = min(origin_x + pool_size_x, upper_bound_w);
+    const int end_y = min(origin_y + pool_size_y, upper_bound_h);
+
+#if defined(EXCLUDE_PADDING)
+    const int begin_x = max(0, origin_x);
+    const int begin_y = max(0, origin_y);
+#else  /* defined(EXCLUDE_PADDING) */
+    const int begin_x = origin_x;
+    const int begin_y = origin_y;
+#endif /* defined(EXCLUDE_PADDING) */
+
+    return ((end_y - begin_y) * (end_x - begin_x));
+}
+
+/** Pools a POOL_SIZE_X x POOL_SIZE_Y window of a quantized NCHW tensor into one output element per workitem.
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note Input data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ * @note When -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT are all defined the result is requantized from the input to the output quantization
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Per-workitem views of the source and destination tensors
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // 32-bit accumulators: 8-wide for the vectorized part of each row, scalar for the row tail
+    int8 acc_vec  = INITIAL_VALUE;
+    int  acc_tail = INITIAL_VALUE;
+
+    for(int row = 0; row < POOL_SIZE_Y; ++row)
+    {
+        int col = 0;
+
+        // Consume the row eight elements at a time
+        while(col <= ((int)POOL_SIZE_X - 8))
+        {
+            VEC_TYPE(8)
+            raw     = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&src, col, row, 0));
+            acc_vec = POOL_OP(acc_vec, convert_int8(raw));
+            col += 8;
+        }
+
+        // Remaining elements of the row, one at a time
+        while(col < (int)POOL_SIZE_X)
+        {
+            const DATA_TYPE raw = *((__global DATA_TYPE *)tensor3D_offset(&src, col, row, 0));
+            acc_tail            = POOL_OP(acc_tail, convert_int(raw));
+            ++col;
+        }
+    }
+
+    // Fold the 8-wide accumulator down to a scalar, then merge the tail accumulator
+    int4 acc4   = POOL_OP(acc_vec.s0123, acc_vec.s4567);
+    int2 acc2   = POOL_OP(acc4.s01, acc4.s23);
+    int  pooled = POOL_OP(acc2.s0, acc2.s1);
+    pooled      = POOL_OP(pooled, acc_tail);
+
+#if defined(POOL_AVG)
+    // Sum -> rounded mean over the pooling region
+    pooled = round(DIV_OP(pooled, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
+#endif /* defined(POOL_AVG) */
+
+    DATA_TYPE result_q8 = CONVERT(pooled, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+    // Dequantize with the input parameters, then requantize with the output parameters
+    const float dequantized = (convert_float(result_q8) - (float)OFFSET_IN1) * (float)SCALE_IN1;
+    const float requantized = dequantized / (float)SCALE_OUT + (float)OFFSET_OUT;
+    result_q8               = CONVERT_SAT(convert_int_rte(requantized), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    // Write the single output element of this workitem
+    *(__global DATA_TYPE *)dst.ptr = result_q8;
+}
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
new file mode 100644
index 0000000000..7524ba7b4a
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
+
+/** Write one prior box as (xmin, ymin, xmax, ymax), normalized by WIDTH / HEIGHT and optionally clipped (NCHW)
+ *
+ * The coordinates are clamped to [0, 1] only when CLIP is defined.
+ *
+ * @param[out] out        Destination image; four consecutive values are written starting at column @p idx of row 0
+ * @param[in]  idx        Index to write to
+ * @param[in]  center_x   Center value of the x axis
+ * @param[in]  center_y   Center value of the y axis
+ * @param[in]  box_width  Prior box width
+ * @param[in]  box_height Prior box height
+ *
+ */
+inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+    // Half extents of the box around its center
+    const float half_w = box_width / 2.f;
+    const float half_h = box_height / 2.f;
+
+    // Corner coordinates normalized by WIDTH / HEIGHT
+    float left   = (center_x - half_w) / WIDTH;
+    float top    = (center_y - half_h) / HEIGHT;
+    float right  = (center_x + half_w) / WIDTH;
+    float bottom = (center_y + half_h) / HEIGHT;
+
+#if defined(CLIP)
+    left   = clamp(left, 0.f, 1.f);
+    top    = clamp(top, 0.f, 1.f);
+    right  = clamp(right, 0.f, 1.f);
+    bottom = clamp(bottom, 0.f, 1.f);
+#endif // defined(CLIP)
+
+    // Store the four coordinates with a single vector write
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(left, top, right, bottom), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0)));
+}
+
+/** Emit all prior boxes generated from one minimum size (NCHW)
+ *
+ * Boxes are written in this order, four values each: the min_size square, then (only if max_size > 0)
+ * the sqrt(min_size * max[min_idx]) square, then one box per aspect ratio that differs from 1.
+ *
+ * @param[out] out                Destination image the boxes are written to
+ * @param[in]  max                The maximum values
+ * @param[in]  aspect_ratios      The aspect ratio values
+ * @param[in]  max_size           The maximum values size
+ * @param[in]  aspect_ratios_size The aspect ratio values size
+ * @param[in]  min_size           Prior box min size
+ * @param[in]  min_idx            Index of the min vector
+ * @param[in]  idx                Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+    // Box center derived from the linear workitem index
+    const float cx = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X;
+    const float cy = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+    // Square box of side min_size
+    calculate_xy_min_max_nchw(out, idx, cx, cy, min_size, min_size);
+    idx += 4;
+
+    // Square box of side sqrt(min_size * max)
+    if(max_size > 0)
+    {
+        const float side = sqrt(min_size * max[min_idx]);
+        calculate_xy_min_max_nchw(out, idx, cx, cy, side, side);
+        idx += 4;
+    }
+
+    // One box per aspect ratio; ratios within 1e-6 of 1 are skipped
+    for(unsigned int r = 0; r < aspect_ratios_size; ++r)
+    {
+        const float ratio = aspect_ratios[r];
+        if(!(fabs(ratio - 1.f) < 1e-6f))
+        {
+            calculate_xy_min_max_nchw(out, idx, cx, cy, min_size * sqrt(ratio), min_size * rsqrt(ratio));
+            idx += 4;
+        }
+    }
+
+    return idx;
+}
+/** Calculate prior boxes with NCHW format.
+ *
+ * Row 0 of the output receives the box coordinates, row 1 receives NUM_PRIORS copies of
+ * (VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3).
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  min                                  The minimum values
+ * @param[in]  max                                  The maximum_values
+ * @param[in]  aspect_ratios                        The aspect ratio values
+ * @param[in]  min_size                             The minimum values size
+ * @param[in]  max_size                             The maximum_values values size
+ * @param[in]  aspect_ratios_size                   The aspect ratio values size
+ */
+__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+                                   unsigned int aspect_ratios_size)
+{
+    Image dst = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // Emit the boxes of every minimum size, threading the write index through the calls
+    int write_idx = 0;
+    for(unsigned int m = 0; m < min_size; ++m)
+    {
+        write_idx = calculate_min_nchw(&dst, max, aspect_ratios, max_size, aspect_ratios_size, min[m], m, write_idx);
+    }
+
+    // The same four variances are stored once per prior in row 1
+    const VEC_DATA_TYPE(DATA_TYPE, 4) variances = (VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3);
+    for(int col = 0; col < (NUM_PRIORS * 4); col += 4)
+    {
+        vstore4(variances, 0, ((__global DATA_TYPE *)offset(&dst, col, 1)));
+    }
+}
+#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl
new file mode 100644
index 0000000000..fab88a1682
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/remap.cl
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#ifndef DEPTH_OUT
+/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_nearest_neighbour_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(mapx),
+    IMAGE_DECLARATION(mapy),
+    const float width,
+    const float height)
+{
+    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
+    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+    // Interleave four x and four y map entries as (x0, y0, x1, y1, x2, y2, x3, y3)
+    float8 map_coords;
+    map_coords.even = vload4(0, (__global float *)mapx.ptr);
+    map_coords.odd  = vload4(0, (__global float *)mapy.ptr);
+
+    vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_bilinear_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(mapx),
+    IMAGE_DECLARATION(mapy),
+    const float width,
+    const float height)
+{
+    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
+    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+    // Interleave four x and four y map entries as (x0, y0, x1, y1, x2, y2, x3, y3)
+    float8 map_coords;
+    map_coords.even = vload4(0, (__global float *)mapx.ptr);
+    map_coords.odd  = vload4(0, (__global float *)mapy.ptr);
+
+    vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
+#endif // DEPTH_OUT
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
new file mode 100644
index 0000000000..f66b17c1a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+/* Maps output coordinates (xo, yo, zo) to source coordinates stored in (xi, yi, zi); expands to a single statement. */
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi)              \
+    do {                                                               \
+        const int block_idx_ = (zo) / (int)(SRC_DEPTH);                \
+        (xi) = (xo) * (int)(STRIDE) + block_idx_ % (int)(STRIDE);      \
+        (yi) = (yo) * (int)(STRIDE) + block_idx_ / (int)(STRIDE);      \
+        (zi) = (zo) % (int)(SRC_DEPTH);                                \
+    } while(0)
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    const int xo = get_global_id(0);
+    const int yo = get_global_id(1);
+    const int zo = get_global_id(2);
+    int       xi, yi, zi; // Source coordinates matching (xo, yo, zo), filled in by the macro below
+    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+    // Byte offset of the source element relative to the first element of the source tensor
+    const int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
new file mode 100644
index 0000000000..63a53cc4f2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+    // x coordinates of four consecutive positions starting at coord.s0; all four share coord.s1 as y
+    const float4 out_x = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 src_x = out_x * (float4)(scale.s0);
+    const float  src_y = coord.s1 * scale.s1;
+#elif SAMPLING_POLICY_CENTER
+    const float4 src_x = (out_x + ((float4)(0.5f))) * (float4)(scale.s0);
+    const float  src_y = (coord.s1 + 0.5f) * scale.s1;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+    // Interleave as (x0, y, x1, y, x2, y, x3, y)
+    return (float8)(src_x.s0, src_y, src_x.s1, src_y, src_x.s2, src_y, src_x.s3, src_y);
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+    const float4 out_x = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 src_x = out_x * (float4)(scale.s0);
+    const float  src_y = coord.s1 * scale.s1;
+#elif SAMPLING_POLICY_CENTER
+    const float4 src_x = (out_x + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float  src_y = (coord.s1 + 0.5f) * scale.s1 - 0.5f;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+    // Four (x, y) pairs sharing one y, interleaved as (x0, y, x1, y, x2, y, x3, y)
+    return (float8)(src_x.s0, src_y, src_x.s1, src_y, src_x.s2, src_y, src_x.s3, src_y);
+}
+
+/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    // Input-space coordinates for the current output coordinates, interleaved as (x, y) pairs
+    float8 src_coords = transform_nearest(get_current_coords(), (float2)(scale_x, scale_y));
+#ifdef ALIGN_CORNERS
+    src_coords = round(src_coords);
+#endif // ALIGN_CORNERS
+    src_coords = clamp_to_border_with_size(src_coords, input_width, input_height, BORDER_SIZE);
+    vstore4(read_texels4(&in, convert_int8(src_coords)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    // Input-space coordinates for the current output coordinates, interleaved as (x, y) pairs
+    const float8 src_coords = transform_bilinear(get_current_coords(), (float2)(scale_x, scale_y));
+    vstore4(bilinear_interpolate_with_border(&in, src_coords, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
new file mode 100644
index 0000000000..946ad65c14
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+#include "warp_helpers_quantized.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+    const float4 out_x = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 src_x = out_x * (float4)(scale.s0);
+    const float  src_y = coord.s1 * scale.s1;
+#elif SAMPLING_POLICY_CENTER
+    const float4 src_x = (out_x + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float  src_y = (coord.s1 + 0.5f) * scale.s1 - 0.5f;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+    // Four (x, y) pairs sharing one y, interleaved as (x0, y, x1, y, x2, y, x3, y)
+    return (float8)(src_x.s0, src_y, src_x.s1, src_y, src_x.s2, src_y, src_x.s3, src_y);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value to use for the QASYMM8 data type is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value to use for the QASYMM8 data type is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    // Input-space coordinates for the current output coordinates, interleaved as (x, y) pairs
+    const float8 src_coords = transform_bilinear_quantized(get_current_coords_quantized(), (float2)(scale_x, scale_y));
+    vstore4(bilinear_interpolate_with_border_quantized(&in, src_coords, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
new file mode 100644
index 0000000000..e162a29bb0
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Only the paddings entries at (0, 0) and (0, 1) take part in the computation below;
+    // the entries at (1, 0) and (1, 1) were previously loaded but never used, so they are no longer read.
+    const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+    const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z     = get_global_id(2);
+    // batch_id / BATCH_IN picks the offset inside a block: x = value % block_x, y = value / block_x
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+    // Copy only when (pos_x, pos_y) lies inside the WIDTH_IN x HEIGHT_IN window that starts at the left pads
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
+
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z     = get_global_id(2);
+
+    // batch_id / BATCH_IN picks the offset inside a block: x = value % block_x, y = value / block_x
+    const int block_x    = BLOCK_SHAPE_X;
+    const int block_y    = BLOCK_SHAPE_Y;
+    const int block_slot = batch_id / BATCH_IN;
+    const int pos_x      = out_x * block_x + (block_slot % block_x);
+    const int pos_y      = out_y * block_y + (block_slot / block_x);
+
+    // Leave the output element untouched when the position falls outside the WIDTH_IN x HEIGHT_IN window
+    if(pos_y < PAD_LEFT_Y || pos_y >= PAD_LEFT_Y + HEIGHT_IN || pos_x < PAD_LEFT_X || pos_x >= PAD_LEFT_X + WIDTH_IN)
+    {
+        return;
+    }
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, pos_x - PAD_LEFT_X, pos_y - PAD_LEFT_Y, z, batch_id % BATCH_IN));
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..aea02e813b
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * Each work-item copies a single element of input batch @p batch_id into the output element addressed by its global id.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=8
+ *       CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE) is used as the range of the input Z index and therefore must not be zero.
+ *       NOTE(review): CHANNEL_SIZE is presumably the number of output channels - confirm against the host-side configuration.
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Range of the input Z index: the output Z index packs (position inside the block, input Z)
+    const int in_depth = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+
+    const int    out_x = get_global_id(0);
+    const int    out_y = get_global_id(1);
+    const size_t out_z = get_global_id(2);
+
+    // Unpack the output Z index
+    const int    src_z     = out_z % in_depth;
+    const size_t block_pos = out_z / in_depth;
+
+    // Position inside the block selects the offset within the BLOCK_SHAPE x BLOCK_SHAPE input patch
+    const int src_x = out_x * BLOCK_SHAPE + (block_pos) % BLOCK_SHAPE;
+    const int src_y = out_y * BLOCK_SHAPE + (block_pos) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, src_x, src_y, src_z, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
new file mode 100644
index 0000000000..723c491165
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * When all four optional defines below are present, each work-item loads 8 consecutive source elements, duplicates every
+ * element along X and stores the resulting 16 elements to two consecutive destination rows. Otherwise each work-item
+ * copies one source element to two vertically adjacent destination elements.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size (only used to compute the first source X coordinate; the load itself is always 8 wide)
+ * -# -DVEC_SIZE_OUT = Output vector size (only used to compute the first destination X coordinate; the store itself is always 16 wide)
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // First X coordinate this work-item touches in the source and in the destination
+    const int first_x_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int first_x_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+
+    // Number of elements by which the access would overshoot the last allowed X (0 when within bounds)
+    const int excess_in  = max(first_x_in - (int)LAST_ACCESSED_X_IN, 0);
+    const int excess_out = max(first_x_out - (int)LAST_ACCESSED_X_OUT, 0);
+
+    // Step both pointers back so that the whole vector access stays within bounds
+    in.ptr -= excess_in * src_stride_x;
+    out.ptr -= excess_out * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)in.ptr);
+
+    // Duplicate every element along X
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0,
+                                              data.s1, data.s1,
+                                              data.s2, data.s2,
+                                              data.s3, data.s3,
+                                              data.s4, data.s4,
+                                              data.s5, data.s5,
+                                              data.s6, data.s6,
+                                              data.s7, data.s7);
+
+    // The same widened row is written to two consecutive destination rows
+    __global DATA_TYPE *row0 = (__global DATA_TYPE *)out.ptr;
+    vstore16(data_out, 0, row0);
+    __global DATA_TYPE *row1 = (__global DATA_TYPE *)tensor3D_offset(&out, 0, 1, 0);
+    vstore16(data_out, 0, row1);
+#else  // !(defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT))
+    // Scalar path: one source element is copied to two vertically adjacent destination elements
+    __global DATA_TYPE *src_elem = (__global DATA_TYPE *)in.ptr;
+
+    *((__global DATA_TYPE *)tensor3D_offset(&out, 0, 0, 0)) = *src_elem;
+    *((__global DATA_TYPE *)tensor3D_offset(&out, 0, 1, 0)) = *src_elem;
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
new file mode 100644
index 0000000000..85eff9e6d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SRC_DIM_Z)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * Each work-item reads one 3x3 (or 3x1 / 1x3) filter plane and writes the transformed values one per destination Z plane.
+ * For the 3x3 case the 16 stored values are G * w * G^T (row-major) with
+ * G = [ 1, 0, 0; 0.5, 0.5, 0.5; 0.5, -0.5, 0.5; 0, 0, 1 ]. For the 3x1 / 1x3 cases the 4 stored values are G * w.
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+ // - horizontal (3x1): three contiguous values from a single row
+ // - vertical (1x3): the first value of three consecutive rows, gathered into one vector
+ // - 3x3: three rows of three contiguous values (w0, w1, w2)
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Row 0
+ // First row of G * w * G^T; for the 1D variants this is the complete transform G * w0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
+
+ // Rows 1-3 only exist for the full 3x3 transform
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ // Built from 0.5 * (w0 + w1 + w2)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out1 = 0.0f;
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+ // Row 2
+ // Built from 0.5 * (w0 - w1 + w2)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out2 = 0.0f;
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+ // Row 3
+ // Built from w2 only
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out3 = 0.0f;
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // The third global id enumerates (filter, channel) pairs with SRC_DIM_Z channels per filter
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ // The filter index selects the destination X position and the channel index the destination Y position
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 16 channels for 3x3 kernels
+ // 4 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * Each work-item reads one 3x3 (or 3x1 / 1x3) filter plane and writes the transformed values one per destination Z plane.
+ * For the 3x3 case the 36 stored values are G * w * G^T (row-major) with
+ * G = [ 1/4, 0, 0; -1/6, -1/6, -1/6; -1/6, 1/6, -1/6; 1/24, 1/12, 1/6; 1/24, -1/12, 1/6; 0, 0, 1 ].
+ * For the 3x1 / 1x3 cases the 6 stored values are the same "Row 0" expressions, i.e. (1/4) * G * w.
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+ // - horizontal (3x1): three contiguous values from a single row
+ // - vertical (1x3): the first value of three consecutive rows, gathered into one vector
+ // - 3x3: three rows of three contiguous values (w0, w1, w2)
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // The output rows are declared as 8-element vectors but only components s0..s5 are written and stored
+
+ // Row 0
+ // Built from w0 / 4 (first row of G); also the only row computed for the 1D variants
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0) / 16.f;
+ out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+ out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+ out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s5 = (w0.s2) / 4.f;
+
+ // Rows 1-5 only exist for the full 3x3 transform
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ // Built from -(w0 + w1 + w2) / 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+ // Row 2
+ // Built from (-w0 + w1 - w2) / 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+ out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+ // Row 3
+ // Built from (w0 + 2 * w1 + 4 * w2) / 24
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 4
+ // Built from (w0 - 2 * w1 + 4 * w2) / 24
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 5
+ // Built from w2 only
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (w2.s0) / 4.f;
+ out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+ out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+ out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // The third global id enumerates (filter, channel) pairs with SRC_DIM_Z channels per filter
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ // The filter index selects the destination X position and the channel index the destination Y position
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 36 channels for 3x3 kernels
+ // 6 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Transform the input tile
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00.s0;
+ out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+ out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+ out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+ out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+ out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+ out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+ out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+ out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+ out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40.s0;
+ out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+ out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+ out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+ out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+ out7.s7 = w41;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+#endif // defined(SRC_DIM_Z)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform; this kernel is only compiled when that macro is defined
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note This kernel does no computation of its own: it forwards every argument, unchanged and in the same order, to winograd_filter_transform_2x2_3x3_nchw
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per work-item (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform; this kernel is only compiled when that macro is defined
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note This kernel does no computation of its own: it forwards every argument, unchanged and in the same order, to winograd_filter_transform_4x4_3x3_nchw
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per work-item (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform; this kernel is only compiled when that macro is defined
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note This kernel does no computation of its own: it forwards every argument, unchanged and in the same order, to winograd_filter_transform_4x4_5x5_nchw
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per work-item (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform; this kernel is only compiled when that macro is defined
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note This kernel does no computation of its own: it forwards every argument, unchanged and in the same order, to winograd_filter_transform_2x2_3x3_nchw
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per work-item (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(TENSOR4D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+    // Thin entry point: every src/dst tensor argument is forwarded unchanged to the 4x4/3x3 kernel.
+    // NOTE(review): the callee is presumably specialised by WINOGRAD_FILTER_TRANSFORM_VERTICAL (the guard
+    // closing this section); its body is outside this chunk - confirm there.
+    winograd_filter_transform_4x4_3x3_nchw(
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
+        src_stride_z, src_step_z, src_stride_w, src_step_w, src_offset_first_element_in_bytes,
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
+        dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(TENSOR4D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+    // Thin entry point: every src/dst tensor argument is forwarded unchanged to the 4x4/5x5 kernel.
+    // NOTE(review): the callee is presumably specialised by WINOGRAD_FILTER_TRANSFORM_VERTICAL (the guard
+    // closing this section); its body is outside this chunk - confirm there.
+    winograd_filter_transform_4x4_5x5_nchw(
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
+        src_stride_z, src_step_z, src_stride_w, src_step_w, src_offset_first_element_in_bytes,
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
+        dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
new file mode 100644
index 0000000000..8c382183c3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
@@ -0,0 +1,1346 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+/* Applies a fixed 8-point linear combination to tmp.s0..tmp.s7 and assigns the eight results to out.s0..out.s7.
+ *
+ * out       Destination; components s0..s7 are assigned.
+ * tmp       Input; components s0..s7 are read.
+ * comm_fact Scratch; components s0..s6 are overwritten with sub-expressions shared between outputs
+ *           (out.s1/out.s2, out.s3/out.s4 and out.s5/out.s6 are each the sum and the difference of the same two terms).
+ *
+ * The body is a statement expression and names every argument several times, so pass plain,
+ * side-effect-free lvalues. comm_fact.s2 has to be assigned before comm_fact.s3 and comm_fact.s6, which read it.
+ * NOTE(review): judging by the name this is one row of the input transform for a 4x4 output tile and a 5x5 filter - confirm against the kernels that use it.
+ */
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = 2.5f * tmp.s3; \
+ comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
+ comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+ })
+
+/* Applies a fixed 8-point linear combination to tmp.s0..tmp.s7 and assigns the eight results to out.s0..out.s7.
+ *
+ * out       Destination; components s0..s7 are assigned.
+ * tmp       Input; components s0..s7 are read.
+ * comm_fact Scratch; components s0..s5 are overwritten with sub-expressions shared between outputs
+ *           (out.s1/out.s2, out.s3/out.s4 and out.s5/out.s6 are each the difference and the sum of the same two terms).
+ *
+ * The body is a statement expression and names every argument several times, so pass plain,
+ * side-effect-free lvalues.
+ * The "0.0f * tmp.s2" term in out.s7 adds nothing for finite inputs (a NaN/Inf in tmp.s2 would still propagate).
+ * NOTE(review): judging by the name this is one row of the input transform for a 2x2 output tile and a 7x7 filter - confirm against the kernels that use it.
+ */
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
+ comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
+ comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
+ out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+ })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // x and y are the tile indices along X and Y; z selects the source plane handled by this work-item
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ // Global id 2 packs the plane index and a batch index: z = id % SRC_DEPTH, b = id / SRC_DEPTH
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address: consecutive tiles start OUTPUT_TILE_W elements / OUTPUT_TILE_H rows apart
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Shift back by the left/top padding.
+ // NOTE(review): for border tiles this can point before the first element - presumably the source is allocated with matching padding; confirm on the host side.
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ // Horizontal 1-D case: 4 contiguous elements of a single row
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ // Vertical 1-D case: 4 elements of a single column, gathered src_stride_y bytes apart
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // 2-D case: 4x4 input patch, one 4-element vector per row
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // tmp0 feeds the first four outputs: the loaded vector itself in the 1-D cases, in_row0 - in_row2 in the 2-D case
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ tmp0 -= in_row2;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Per-vector combination v -> [v0 - v2, v1 + v2, v2 - v1, v1 - v3]
+ DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+ DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+ DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+ DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // 2-D case only: the remaining three row combinations, each put through the same per-vector combination
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
+
+ DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+ DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+ DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+ DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
+
+ DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+ DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+ DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+ DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
+
+ DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+ DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+ DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+ DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Destination address: z indexes dst along X (in elements), the linear tile index x + y * NUM_TILES_X indexes it along Y
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ // Each transformed value goes to its own Z plane of dst: planes 0-3 always, planes 4-15 only in the 2-D case
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // plane 0
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // plane 1
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // plane 2
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // plane 3
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // x and y are the tile indices along X and Y; each work-item handles the two consecutive source planes z and z + 1
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ // Twice global id 2 packs the first plane index and a batch index: z = (2 * id) % SRC_DEPTH, b = (2 * id) / SRC_DEPTH.
+ // The kernel description requires the channel count to be a multiple of 2, so a plane pair never straddles two batches.
+ const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+ const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address: consecutive tiles start OUTPUT_TILE_W elements / OUTPUT_TILE_H rows apart
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ // Shift back by the left/top padding.
+ // NOTE(review): for border tiles this can point before the first element - presumably the source is allocated with matching padding; confirm on the host side.
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+ // First plane (z): in_row0 only in the 1-D cases, in_row0..in_row3 in the 2-D case
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ // Horizontal 1-D case: 4 contiguous elements of a single row
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ // Vertical 1-D case: 4 elements of a single column, gathered src_stride_y bytes apart
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // 2-D case: 4x4 input patch, one 4-element vector per row
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Second plane (z + 1): same loads one src_stride_z further on, held in in_row4 (1-D) or in_row4..in_row7 (2-D)
+ src_addr += src_stride_z;
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // tmp0 (plane z) and tmp4 (plane z + 1) feed the first four outputs: the loaded vectors themselves in the 1-D cases,
+ // in_row0 - in_row2 and in_row4 - in_row6 in the 2-D case
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp4 = in_row4;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ tmp0 -= in_row2;
+ tmp4 -= in_row6;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Per-vector combination v -> [v0 - v2, v1 + v2, v2 - v1, v1 - v3]; lane 0 of each result is plane z, lane 1 is plane z + 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // 2-D case only: the remaining three row combinations for plane z (tmp1..tmp3) and plane z + 1 (tmp5..tmp7)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp5 = in_row5 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp6 = in_row6 - in_row5;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp7 = in_row5 - in_row7;
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Destination address: z indexes dst along X (in elements), the linear tile index x + y * NUM_TILES_X indexes it along Y
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ // Each vstore2 writes the plane-z and plane-(z + 1) values to two adjacent X elements of one dst Z plane:
+ // planes 0-3 always, planes 4-15 only in the 2-D case
+ vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+ vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+ vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+ vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+ vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+ vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+ vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+ vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+ vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+ vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+ vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+ vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+ vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+ vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+ vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
+#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE out0 = 0.0f;
+ DATA_TYPE out1 = 0.0f;
+ DATA_TYPE out2 = 0.0f;
+ DATA_TYPE out3 = 0.0f;
+ DATA_TYPE out4 = 0.0f;
+ DATA_TYPE out5 = 0.0f;
+
+ // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+ out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
+ out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
+ out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
+ out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
+ out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
+ out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row4
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+
+ // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
+ DATA_TYPE k0 = d41.s0;
+ DATA_TYPE k1 = d41.s0;
+ DATA_TYPE k2 = d41.s0;
+ DATA_TYPE k3 = d41.s0;
+ DATA_TYPE k4 = d41.s0;
+ DATA_TYPE k5 = 0.0f;
+
+ k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+ k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+ k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+ k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+ k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+ k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+ out0 += k0;
+ out1 += k1;
+ out2 += k2;
+ out3 += k3;
+ out4 += k4;
+ out5 += k5;
+
+ // Row2
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+
+ out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
+ out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
+ out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
+ out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
+ out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
+ out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Compute destination address
+#if defined(SRC_DEPTH)
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
+
+ uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ DATA_TYPE out6 = k0;
+ DATA_TYPE out7 = k1;
+ DATA_TYPE out8 = k2;
+ DATA_TYPE out9 = k3;
+ DATA_TYPE out10 = k4;
+ DATA_TYPE out11 = k5;
+ DATA_TYPE out12 = k0;
+ DATA_TYPE out13 = k1;
+ DATA_TYPE out14 = k2;
+ DATA_TYPE out15 = k3;
+ DATA_TYPE out16 = k4;
+ DATA_TYPE out17 = k5;
+ DATA_TYPE out18 = k0;
+ DATA_TYPE out19 = k1;
+ DATA_TYPE out20 = k2;
+ DATA_TYPE out21 = k3;
+ DATA_TYPE out22 = k4;
+ DATA_TYPE out23 = k5;
+ DATA_TYPE out24 = k0;
+ DATA_TYPE out25 = k1;
+ DATA_TYPE out26 = k2;
+ DATA_TYPE out27 = k3;
+ DATA_TYPE out28 = k4;
+ DATA_TYPE out29 = k5;
+
+ // Row1
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+
+ // Row3
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+
+ // Compute common parts for the channels between [6, 29]
+ // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
+ // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+ DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+ DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
+ DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+ DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+ DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+ DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
+ DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+ DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+ // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+ // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+ DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+ DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+ DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+ DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+ DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
+ DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+ DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
+ DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+ DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+ out6 += part0 - part1;
+ out12 += part0 + part1;
+ out7 += part2 + part3 + part4 + part5;
+ out8 += part2 - part3 + part4 - part5;
+ out13 += part2 + part3 - part4 - part5;
+ out14 += part2 - part3 - part4 + part5;
+ out9 += part6 + part7 + part8 + part9;
+ out10 += part6 - part7 + part8 - part9;
+ out15 += part6 - part7 - part8 + part9;
+ out16 += part6 + part7 - part8 - part9;
+ out11 += part10 + part11;
+ out17 += part10 - part11;
+
+ out18 += part13 - part12;
+ out24 += part13 + part12;
+ out19 += part14 + part15 + part16 + part17;
+ out20 += part14 - part15 + part16 - part17;
+ out25 += part14 - part15 - part16 + part17;
+ out26 += part14 + part15 - part16 - part17;
+ out21 += part18 + part19 + part20 + part21;
+ out22 += part18 - part19 + part20 - part21;
+ out27 += part18 - part19 - part20 + part21;
+ out28 += part18 + part19 - part20 - part21;
+ out23 += part22 + part23;
+ out29 += part22 - part23;
+
+ *(dst_addr) = out6;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out7;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out8;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out9;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out10;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out11;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out12;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out13;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out14;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out15;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out16;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out17;
+ dst_addr += dst_plane_stride;
+
+ *(dst_addr) = out18;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out19;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out20;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out21;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out22;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out23;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out24;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out25;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out26;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out27;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out28;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out29;
+ dst_addr += dst_plane_stride;
+
+ // Row5
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+
+ // Channels [30, 35]
+ out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
+ *
+ * Each work-item processes one tile: global ids 0 and 1 are the tile indices along x and y, global id 2 selects the source Z index.
+ * The work-item reads 8 rows of 8 elements (5x5), one row of 8 elements (5x1) or one column of 8 elements (1x5) from the source
+ * and writes 64 (5x5) or 8 (5x1/1x5) transformed values to the destination, one value per plane, dst_stride_z bytes apart.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note If -DSRC_DEPTH is passed at compile time, global id 2 is split into z = id % SRC_DEPTH and b = id / SRC_DEPTH, and b is used
+ *       together with src_stride_w and dst_stride_w to offset the source and destination addresses. Otherwise the two W strides are not used.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address: tile (x, y) starts OUTPUT_TILE_W elements / OUTPUT_TILE_H rows after the previous tile
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y); // Step back by PAD_LEFT elements and PAD_TOP rows
+
+ // Load input tile: a single row of 8 (horizontal), a single column of 8 (vertical) or 8 rows of 8 elements (2D)
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Calculate common factors for intermediate tensor (tmp0..tmp7 are linear combinations of the input rows; the 1D variants only use tmp0 = in_row0)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = 0.0f;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
+ tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+ comm_fact0 = (DATA_TYPE)2.5f * in_row3;
+ comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+ comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
+ comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Calculate output rows (reuse comm_fact0 vector). OUTPUT_ROW_4x4_5x5 is defined outside this kernel; presumably it fills its first argument from the second using the third as scratch - TODO confirm
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0;
+
+ OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1, out2, out3, out4, out5, out6, out7;
+
+ OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Store values across the channels: output element k is written k * dst_stride_z bytes after dst_addr (k = 0..7 always, 8..63 only in the 2D case)
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
+ *
+ * The kernel forwards all of its arguments, unchanged, to winograd_input_transform_2x2_3x3_stepz1_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
+ *
+ * The kernel forwards all of its arguments, unchanged, to winograd_input_transform_2x2_3x3_stepz2_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
+ *
+ * The kernel forwards all of its arguments, unchanged, to winograd_input_transform_4x4_3x3_stepz1_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+/** Winograd input transform for NCHW tensors when the kernel size is 1x3 and the output tile is 1x2
+ *
+ * Thin entry point: every argument is forwarded unchanged to winograd_input_transform_2x2_3x3_stepz1_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Forward source arguments, destination arguments and the two W strides as-is
+    winograd_input_transform_2x2_3x3_stepz1_nchw(
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_offset_first_element_in_bytes,
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
+        src_stride_w, dst_stride_w);
+}
+
+/** Winograd input transform for NCHW tensors when the kernel size is 1x3, the output tile is 1x2 and the number of channels is a multiple of 2
+ *
+ * Thin entry point: every argument is forwarded unchanged to winograd_input_transform_2x2_3x3_stepz2_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Forward source arguments, destination arguments and the two W strides as-is
+    winograd_input_transform_2x2_3x3_stepz2_nchw(
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_offset_first_element_in_bytes,
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
+        src_stride_w, dst_stride_w);
+}
+
+/** Winograd input transform for NCHW tensors when the kernel size is 1x3 and the output tile is 1x4
+ *
+ * Thin entry point: every argument is forwarded unchanged to winograd_input_transform_4x4_3x3_stepz1_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Forward source arguments, destination arguments and the two W strides as-is
+    winograd_input_transform_4x4_3x3_stepz1_nchw(
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_offset_first_element_in_bytes,
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
+        src_stride_w, dst_stride_w);
+}
+
+/** Winograd input transform for NCHW tensors when the kernel size is 1x5 and the output tile is 1x4
+ *
+ * Thin entry point: every argument is forwarded unchanged to winograd_input_transform_4x4_5x5_stepz1_nchw.
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Forward source arguments, destination arguments and the two W strides as-is
+    winograd_input_transform_4x4_5x5_stepz1_nchw(
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_offset_first_element_in_bytes,
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
+        src_stride_w, dst_stride_w);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
new file mode 100644
index 0000000000..861ed50651
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
@@ -0,0 +1,1082 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** Winograd output transform for NCHW tensors when the output tile is 2x2, 2x1 or 1x2 and the filter size is 3x3, 3x1 or 1x3
+ *
+ * Each work-item reads the transformed values of one tile (16 values in the 2x2 case, 4 values in the 2x1/1x2 cases),
+ * which are stored src_stride_z bytes apart, combines them into the output tile, optionally adds the bias,
+ * applies the activation and writes the tile to the destination tensor.
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. This kernel is only compiled when -DVEC_SIZE=2
+ * @note If -DHAS_BIAS is passed at compile time, the kernel takes an additional bias vector and adds the element at index get_global_id(0) to every output value
+ * @note If -DSRC_DEPTH is passed at compile time, the source is addressed as a 4D tensor and the destination batch index is get_global_id(2) / SRC_DEPTH
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  bias                              (Only with -DHAS_BIAS) Bias vector arguments as expanded by VECTOR_DECLARATION(bias)
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Resolve the address of the first transformed value handled by this work-item
+#if defined(SRC_DEPTH)
+    Tensor4D              src_tensor = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr   = tensor4D_offset(&src_tensor, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D              src_tensor = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr   = tensor3D_offset(&src_tensor, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    // Row 0 of the transformed tile; it is the whole input for the 2x1 and 1x2 cases
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // 1D transform (2x1 or 1x2 output tile):
+    //   out00 = d00 + d01 + d02
+    //   out01 = d01 - d02 - d03
+    float out00 = d00 + d01 + d02;
+    float out01 = d01 - d02 - d03;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Rows 1 to 3 of the 4x4 transformed tile
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    // Partial sums over columns 1 and 2 that are shared by two outputs each
+    float col1_sum  = d01 + d11 + d21;
+    float col2_sum  = d02 + d12 + d22;
+    float col1_diff = d11 - d21 - d31;
+    float col2_diff = d12 - d22 - d32;
+
+    // 2D transform (2x2 output tile):
+    //   out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+    //   out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+    //   out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+    //   out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+    float out00 = d10;
+    float out01 = -d13;
+    float out10 = d10;
+    float out11 = -d13;
+
+    out00 += d00 + d20 + col1_sum + col2_sum;
+    out01 += col1_sum - col2_sum - (d03 + d23);
+    out10 += -d20 - d30 + col1_diff + col2_diff;
+    out11 += col1_diff - col2_diff + d23 + d33;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Map the linear tile index (global id 1) to the top-left element of the output tile
+    const int tile_idx = get_global_id(1);
+    const int dst_x    = (tile_idx % NUM_TILES_X) * OUTPUT_TILE_W;
+    const int dst_y    = (tile_idx / NUM_TILES_X) * OUTPUT_TILE_H;
+    const int dst_z    = get_global_id(0);
+
+#if defined(HAS_BIAS)
+    // Add the bias element selected by the destination Z coordinate to every output value
+    Vector bias_vec = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+    float  bias_val = (float) * ((__global DATA_TYPE *)(vector_offset(&bias_vec, dst_z)));
+
+    out00 += bias_val;
+    out01 += bias_val;
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // bias_val was loaded from a DATA_TYPE element, so the cast back to DATA_TYPE does not change its value
+    out10 += (DATA_TYPE)bias_val;
+    out11 += (DATA_TYPE)bias_val;
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(HAS_BIAS)
+
+    // Destination address of the tile; the batch offset only exists for the 4D (SRC_DEPTH) case
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + dst_z * dst_stride_z;
+#if defined(SRC_DEPTH)
+    const int batch_idx = get_global_id(2) / SRC_DEPTH;
+    dst_addr += batch_idx * dst_stride_w;
+#endif /* defined(SRC_DEPTH) */
+
+    // Convert the first pair of outputs to DATA_TYPE and apply the activation
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    row0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // 1x2 tile: the two values go to two consecutive rows
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = row0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = row0.s1;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // 2x1 tile or first row of the 2x2 tile: the two values are adjacent along X
+    vstore2(row0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Second row of the 2x2 tile
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    row1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+    vstore2(row1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the 6x6 or 6x1 tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00, out01, out02 and out03
+ float out00 = d00 + d01 + d02 + d03 + d04;
+ float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
+ float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
+ float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+
+ // Compute out00, out01, out02 and out03
+ float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31;
+
+ float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+ float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+ out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+ out01 += k1 - d02 - d12 - d22 - d32 - d42;
+ out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+ out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+ // Compute out10, out11, out12 and out13
+ float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+ k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+ out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+ out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+ out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+ out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+ // Compute out20, out21, out22 and out23
+ float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+ k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+ out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+ out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+ out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+ out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+ // Compute out30, out31, out32 and out33
+ float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+ k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+ out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+ out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+ out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+ out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Store the output tile
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+ // Add bias
+ out10 += (float)b;
+ out11 += (float)b;
+ out12 += (float)b;
+ out13 += (float)b;
+
+ out20 += (float)b;
+ out21 += (float)b;
+ out22 += (float)b;
+ out23 += (float)b;
+
+ out30 += (float)b;
+ out31 += (float)b;
+ out32 += (float)b;
+ out33 += (float)b;
+#endif // defined(HAS_BIAS)
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+/** Combines eight scalar inputs into the four components of @p col.
+ *
+ * With S0 = d1 + d2, S1 = d3 + d4, S2 = d5 + d6 and D0 = d1 - d2, D1 = d3 - d4, D2 = d5 - d6 the macro computes:
+ *   col.s0 = S0 + S1 + 8 * S2 + d0
+ *   col.s1 = D0 + 2 * D1 + 4 * D2
+ *   col.s2 = S0 + 4 * S1 + 2 * S2
+ *   col.s3 = D0 + 8 * D1 + D2 + d7
+ *
+ * @note Components .s0, .s1 and .s2 of @p comm_fact are used as scratch storage: on exit they hold D0, D1 and D2.
+ * @note d1 ... d6 are each evaluated twice, so the arguments must be free of side effects.
+ * @note Every argument is parenthesised in the expansion so that compound expressions can be passed safely.
+ * @note The body is a ({ ... }) statement expression, as in the original definition.
+ *
+ * @param[out]     col       Vector lvalue whose components .s0 - .s3 receive the results
+ * @param[in]      d0        Input that only contributes to col.s0
+ * @param[in]      d1        First input of the pair (d1, d2)
+ * @param[in]      d2        Second input of the pair (d1, d2)
+ * @param[in]      d3        First input of the pair (d3, d4)
+ * @param[in]      d4        Second input of the pair (d3, d4)
+ * @param[in]      d5        First input of the pair (d5, d6)
+ * @param[in]      d6        Second input of the pair (d5, d6)
+ * @param[in]      d7        Input that only contributes to col.s3
+ * @param[in, out] comm_fact Scratch vector lvalue (components .s0 - .s2 are overwritten)
+ */
+#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
+ ({ \
+ (comm_fact).s0 = (d1) + (d2); \
+ (comm_fact).s1 = (d3) + (d4); \
+ (comm_fact).s2 = (d5) + (d6); \
+ \
+ (col).s0 = (comm_fact).s0 + (comm_fact).s1 + 8.f * (comm_fact).s2 + (d0); \
+ (col).s2 = (comm_fact).s0 + 4.f * (comm_fact).s1 + 2.f * (comm_fact).s2; \
+ \
+ (comm_fact).s0 = (d1) - (d2); \
+ (comm_fact).s1 = (d3) - (d4); \
+ (comm_fact).s2 = (d5) - (d6); \
+ \
+ (col).s1 = (comm_fact).s0 + 2.f * (comm_fact).s1 + 4.f * (comm_fact).s2; \
+ (col).s3 = (comm_fact).s0 + 8.f * (comm_fact).s1 + (comm_fact).s2 + (d7); \
+ })
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
+ *
+ * Each work-item reads its input tile element by element along the Z dimension of the source tensor
+ * (8 elements for the 4x1/1x4 cases, 64 elements for the 4x4 case), applies the output transform in float,
+ * optionally adds the bias, applies the activation and stores the result converted to DATA_TYPE.
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note If -DSRC_DEPTH is passed at compile time, the source is addressed as a 4D tensor and the batch index is computed as get_global_id(2) / SRC_DEPTH
+ * @note If -DHAS_BIAS is passed at compile time, the bias element at index get_global_id(0) is added to every output value before the activation
+ * @note ACTIVATION_TYPE, VEC_SIZE, A_VAL and B_VAL are forwarded to the ACTIVATION macro
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr (Optional, only with -DHAS_BIAS) Pointer to the bias vector. It is read as DATA_TYPE
+ * @param[in] bias_stride_x (Optional, only with -DHAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional, only with -DHAS_BIAS) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional, only with -DHAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute output address: global id 1 enumerates the tiles row by row, global id 0 selects the output plane
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+ // Load the bias once; it is shared by the 1D and the 2D code paths below and always accumulated in float
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+#endif // defined(HAS_BIAS)
+
+ // Load the values across the channels to compose the input tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // 1D case: compute out00, out01, out02 and out03 from the eight loaded values
+ float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
+ float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
+ float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
+ float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ out00 += b;
+ out01 += b;
+ out02 += b;
+ out03 += b;
+#endif // defined(HAS_BIAS)
+
+ // Apply the activation and convert to the output data type once for both store variants
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // 1x4 tile: one element per destination row
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // 4x1 tile: four contiguous elements in a single destination row
+ vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // 2D case: load the remaining seven rows of the 8x8 input tile
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+ // Compute the 8x4 intermediate tensor: tmp_colN holds column N of the input tile reduced from 8 to 4 values
+ VEC_DATA_TYPE(float, 4)
+ comm_fact0, comm_fact1, comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+ // comm_fact0 is only scratch storage for the macro here; it is fully reassigned below
+ COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
+ // Compute the 4x4 output tile: out_colM holds output column M, component .sR being output row R
+ comm_fact0 = tmp_col1 + tmp_col2;
+ comm_fact1 = tmp_col3 + tmp_col4;
+ comm_fact2 = tmp_col5 + tmp_col6;
+
+ VEC_DATA_TYPE(float, 4)
+ out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
+ VEC_DATA_TYPE(float, 4)
+ out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+
+ comm_fact0 = tmp_col1 - tmp_col2;
+ comm_fact1 = tmp_col3 - tmp_col4;
+ comm_fact2 = tmp_col5 - tmp_col6;
+
+ VEC_DATA_TYPE(float, 4)
+ out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ out_col0 += (VEC_DATA_TYPE(float, 4))b;
+ out_col1 += (VEC_DATA_TYPE(float, 4))b;
+ out_col2 += (VEC_DATA_TYPE(float, 4))b;
+ out_col3 += (VEC_DATA_TYPE(float, 4))b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile: gather component .sR of every output column to form destination row R
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** Winograd output transform for a 2x1 output tile and a 3x1 filter with NCHW data layout
+ *
+ * This kernel forwards all of its arguments, unchanged, to winograd_output_transform_2x2_3x3_nchw.
+ *
+ * @note This kernel is only compiled when -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL and -DVEC_SIZE=2 are passed at compile time
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The bias arguments are part of the signature only when -DHAS_BIAS is passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr (Optional, only with -DHAS_BIAS) Pointer to the bias vector
+ * @param[in] bias_stride_x (Optional, only with -DHAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional, only with -DHAS_BIAS) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional, only with -DHAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Delegate to the 2x2/3x3 kernel: source tensor, destination tensor and (optionally) bias vector
+ winograd_output_transform_2x2_3x3_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_stride_w, src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr, bias_stride_x, bias_step_x, bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** Winograd output transform for a 4x1 output tile and a 3x1 filter with NCHW data layout
+ *
+ * This kernel forwards all of its arguments, unchanged, to winograd_output_transform_4x4_3x3_nchw.
+ *
+ * @note This kernel is only compiled when -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL and -DVEC_SIZE=4 are passed at compile time
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The bias arguments are part of the signature only when -DHAS_BIAS is passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr (Optional, only with -DHAS_BIAS) Pointer to the bias vector
+ * @param[in] bias_stride_x (Optional, only with -DHAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional, only with -DHAS_BIAS) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional, only with -DHAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Delegate to the 4x4/3x3 kernel: source tensor, destination tensor and (optionally) bias vector
+ winograd_output_transform_4x4_3x3_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_stride_w, src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr, bias_stride_x, bias_step_x, bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+/** Winograd output transform for a 4x1 output tile and a 5x1 filter with NCHW data layout
+ *
+ * This kernel forwards all of its arguments, unchanged, to winograd_output_transform_4x4_5x5_nchw.
+ *
+ * @note This kernel is only compiled when -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL and -DVEC_SIZE=4 are passed at compile time
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The bias arguments are part of the signature only when -DHAS_BIAS is passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr (Optional, only with -DHAS_BIAS) Pointer to the bias vector
+ * @param[in] bias_stride_x (Optional, only with -DHAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional, only with -DHAS_BIAS) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional, only with -DHAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Delegate to the 4x4/5x5 kernel: source tensor, destination tensor and (optionally) bias vector
+ winograd_output_transform_4x4_5x5_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_stride_w, src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr, bias_stride_x, bias_step_x, bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** Winograd output transform for a 1x2 output tile and a 1x3 filter with NCHW data layout
+ *
+ * This kernel forwards all of its arguments, unchanged, to winograd_output_transform_2x2_3x3_nchw.
+ *
+ * @note This kernel is only compiled when -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL and -DVEC_SIZE=2 are passed at compile time
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The bias arguments are part of the signature only when -DHAS_BIAS is passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr (Optional, only with -DHAS_BIAS) Pointer to the bias vector
+ * @param[in] bias_stride_x (Optional, only with -DHAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional, only with -DHAS_BIAS) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional, only with -DHAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Delegate to the 2x2/3x3 kernel: source tensor, destination tensor and (optionally) bias vector
+ winograd_output_transform_2x2_3x3_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y, src_stride_z, src_step_z, src_stride_w, src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y, dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr, bias_stride_x, bias_step_x, bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** Winograd output transform kernel for a 1x4 output tile and a 1x3 filter, NCHW data layout.
+ *
+ * This kernel is a thin entry point: it forwards all of its arguments, unchanged and in the same order,
+ * to winograd_output_transform_4x4_3x3_nchw.
+ *
+ * @note -DNUM_TILES_X must be passed at compile time with the number of tiles along the X direction, e.g. -DNUM_TILES_X=16
+ * @note -DOUTPUT_TILE_W must be passed at compile time with the width of the output tile, e.g. -DOUTPUT_TILE_W=1
+ * @note -DOUTPUT_TILE_H must be passed at compile time with the height of the output tile, e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note -DDATA_TYPE must be passed at compile time with the data type, e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The kernel is enclosed in a preprocessor guard that requires VEC_SIZE to be defined and equal to 4
+ * @note The bias arguments are declared and forwarded only when HAS_BIAS is defined at compile time
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  bias_ptr                           (Only with HAS_BIAS) Pointer to the bias vector
+ * @param[in]  bias_stride_x                      (Only with HAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Only with HAS_BIAS) bias_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Only with HAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_1x4_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Pure delegation: the arguments are grouped per tensor below, but their order is unchanged.
+    winograd_output_transform_4x4_3x3_nchw(
+        // Source tensor
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
+        src_stride_z, src_step_z, src_stride_w, src_step_w,
+        src_offset_first_element_in_bytes,
+        // Destination tensor
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
+        dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
+        dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+        ,
+        // Bias vector
+        bias_ptr, bias_stride_x, bias_step_x, bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+    );
+}
+
+/** Winograd output transform kernel for a 1x4 output tile and a 1x5 filter, NCHW data layout.
+ *
+ * This kernel is a thin entry point: it forwards all of its arguments, unchanged and in the same order,
+ * to winograd_output_transform_4x4_5x5_nchw.
+ *
+ * @note -DNUM_TILES_X must be passed at compile time with the number of tiles along the X direction, e.g. -DNUM_TILES_X=16
+ * @note -DOUTPUT_TILE_W must be passed at compile time with the width of the output tile, e.g. -DOUTPUT_TILE_W=1
+ * @note -DOUTPUT_TILE_H must be passed at compile time with the height of the output tile, e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note -DDATA_TYPE must be passed at compile time with the data type, e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The kernel is enclosed in a preprocessor guard that requires VEC_SIZE to be defined and equal to 4
+ * @note The bias arguments are declared and forwarded only when HAS_BIAS is defined at compile time
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  bias_ptr                           (Only with HAS_BIAS) Pointer to the bias vector
+ * @param[in]  bias_stride_x                      (Only with HAS_BIAS) Stride of the bias vector in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Only with HAS_BIAS) bias_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Only with HAS_BIAS) The offset of the first element in the bias vector
+ */
+__kernel void winograd_output_transform_1x4_1x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Pure delegation: the arguments are grouped per tensor below, but their order is unchanged.
+    winograd_output_transform_4x4_5x5_nchw(
+        // Source tensor
+        src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
+        src_stride_z, src_step_z, src_stride_w, src_step_w,
+        src_offset_first_element_in_bytes,
+        // Destination tensor
+        dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
+        dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
+        dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+        ,
+        // Bias vector
+        bias_ptr, bias_stride_x, bias_step_x, bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+    );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)