Diffstat (limited to 'src/core/CL/cl_kernels/nhwc')
-rw-r--r-- src/core/CL/cl_kernels/nhwc/batch_to_space.cl | 131
-rw-r--r-- src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl | 146
-rw-r--r-- src/core/CL/cl_kernels/nhwc/channel_shuffle.cl | 160
-rw-r--r-- src/core/CL/cl_kernels/nhwc/depth_to_space.cl | 69
-rw-r--r-- src/core/CL/cl_kernels/nhwc/dequantization_layer.cl | 87
-rw-r--r-- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 295
-rw-r--r-- src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl | 281
-rw-r--r-- src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 211
-rw-r--r-- src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 275
-rw-r--r-- src/core/CL/cl_kernels/nhwc/im2col.cl | 526
-rw-r--r-- src/core/CL/cl_kernels/nhwc/indirect_convolution.cl | 305
-rw-r--r-- src/core/CL/cl_kernels/nhwc/normalization_layer.cl | 177
-rw-r--r-- src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl | 81
-rw-r--r-- src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl | 96
-rw-r--r-- src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl | 197
-rw-r--r-- src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl | 185
-rw-r--r-- src/core/CL/cl_kernels/nhwc/pooling_layer.cl | 364
-rw-r--r-- src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl | 164
-rw-r--r-- src/core/CL/cl_kernels/nhwc/reorg_layer.cl | 76
-rw-r--r-- src/core/CL/cl_kernels/nhwc/scale.cl | 245
-rw-r--r-- src/core/CL/cl_kernels/nhwc/space_to_batch.cl | 155
-rw-r--r-- src/core/CL/cl_kernels/nhwc/space_to_depth.cl | 69
-rw-r--r-- src/core/CL/cl_kernels/nhwc/transposed_convolution.cl | 297
-rw-r--r-- src/core/CL/cl_kernels/nhwc/upsample_layer.cl | 80
-rw-r--r-- src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl | 1107
-rw-r--r-- src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl | 1050
-rw-r--r-- src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl | 1109
27 files changed, 7938 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/batch_to_space.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
new file mode 100644
index 0000000000..b910a753a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ VECTOR_DECLARATION(block_shape),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * (block_x)) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, in_batch));
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The left and top crop must be passed at compile time using -DCROP_LEFT and -DCROP_TOP. e.g. -DCROP_LEFT=0 -DCROP_TOP=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int x = get_global_id(1) + CROP_LEFT;
+ const int y = get_global_id(2) + CROP_TOP;
+ const int z = get_global_id(0);
+
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * (block_x)) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, in_batch));
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
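Both kernels above share the same gather arithmetic; only the source of the block shape differs (runtime tensor vs. compile-time defines). For reference, here is a minimal host-side C sketch of that index mapping, assuming dense, unpadded NHWC float buffers and ignoring cropping; the function name and these layout assumptions are illustrative, not part of the library.

```c
#include <stddef.h>

/* Reference mapping: output element (b, y, x, z) gathers from input batch
 * in_batch at spatial (in_y, in_x), same channel z, exactly as the kernels
 * compute it. batch_size corresponds to -DBATCH_SIZE. */
void batch_to_space_nhwc_ref(const float *in, float *out,
                             int channels, int out_w, int out_h, int out_batches,
                             int block_x, int block_y, int batch_size)
{
    const int in_w = out_w / block_x;
    const int in_h = out_h / block_y;

    for (int b = 0; b < out_batches; ++b)
        for (int y = 0; y < out_h; ++y)
            for (int x = 0; x < out_w; ++x)
                for (int z = 0; z < channels; ++z)
                {
                    /* Same arithmetic as the kernel body. */
                    const int in_batch = b + ((x % block_x) + (y % block_y) * block_x) * batch_size;
                    const int in_x     = x / block_x;
                    const int in_y     = y / block_y;

                    const size_t src = (((size_t)in_batch * in_h + in_y) * in_w + in_x) * channels + z;
                    const size_t dst = (((size_t)b * out_h + y) * out_w + x) * channels + z;
                    out[dst] = in[src];
                }
}
```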
diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
new file mode 100644
index 0000000000..cb2da1bd99
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
+#ifndef USE_DEFAULT_BETA
+ __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = 0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
+
+ res0 = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res0 = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
+ // beta is not zero, hence we need to perform the addition
+ res0 = ADD_OP(res0, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
\ No newline at end of file
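Stripped of the vectorization and leftover handling, each element processed by the kernel above reduces to the familiar batch-normalization expression, sketched below in scalar C with illustrative names. When USE_DEFAULT_GAMMA or USE_DEFAULT_BETA is defined, the corresponding factor defaults to 1 or 0, and the selected activation is applied to the result.

```c
#include <math.h>

/* Scalar sketch of batchnormalization_layer_nhwc's per-element math. */
static inline float batchnorm_ref(float x, float mean, float var,
                                  float gamma, float beta, float epsilon)
{
    const float denominator = 1.0f / sqrtf(var + epsilon); /* INVSQRT_OP(ADD_OP(var, epsilon)) */
    const float x_bar       = (x - mean) * denominator;    /* MUL_OP(SUB_OP(data, mean), denominator) */
    return gamma * x_bar + beta;                           /* activation is applied afterwards */
}
```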
diff --git a/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
new file mode 100644
index 0000000000..233beb3aa9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+ uint r = div_res * (y); \
+ mod_res = (x) - r; \
+ })
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ // Offset computation
+ const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map
+
+ uint z = 0;
+ uint batch_id = 0;
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));
+
+ // Load the values
+ const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);
+
+#if VEC_SIZE == 1
+ DATA_TYPE out0 = *(input_ptr + in_channels);
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1)
+ };
+#elif VEC_SIZE == 3
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2)
+ };
+#elif VEC_SIZE == 4
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3)
+ };
+#elif VEC_SIZE == 8
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3),
+ *(input_ptr + in_channels.s4),
+ *(input_ptr + in_channels.s5),
+ *(input_ptr + in_channels.s6),
+ *(input_ptr + in_channels.s7)
+ };
+#elif VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3),
+ *(input_ptr + in_channels.s4),
+ *(input_ptr + in_channels.s5),
+ *(input_ptr + in_channels.s6),
+ *(input_ptr + in_channels.s7),
+ *(input_ptr + in_channels.s8),
+ *(input_ptr + in_channels.s9),
+ *(input_ptr + in_channels.sa),
+ *(input_ptr + in_channels.sb),
+ *(input_ptr + in_channels.sc),
+ *(input_ptr + in_channels.sd),
+ *(input_ptr + in_channels.se),
+ *(input_ptr + in_channels.sf)
+ };
+#endif // VEC_SIZE == 1
+
+ __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
\ No newline at end of file
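The heart of the kernel above is the channel permutation: with C channels split into G groups of K = C / G channels each, output channel c reads input channel (c * K) % C + c / G. A scalar C sketch with a hypothetical helper name:

```c
/* Maps an output channel index to the input channel it reads from. */
static inline int shuffled_src_channel(int out_channel, int num_channels, int num_groups)
{
    const int k = num_channels / num_groups; /* channels per group, -DK */
    /* Same expression as the kernel: (c * K) % C + c / G. */
    return (out_channel * k) % num_channels + out_channel / num_groups;
}
```

For example, with num_channels = 6 and num_groups = 2, output channels 0..5 read from input channels 0, 3, 1, 4, 2, 5, i.e. the two groups are interleaved.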
diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
new file mode 100644
index 0000000000..84f8aa7263
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
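For reference, a host-side C sketch of the same rearrangement, assuming dense NHWC buffers (the names and the float element type are illustrative): input channel c lands at output channel c % r, while c / r selects the position inside the block.

```c
#include <stddef.h>

/* bs corresponds to -DBLOCK_SHAPE, in_c to -DCHANNEL_SIZE. */
void depth_to_space_nhwc_ref(const float *in, float *out,
                             int batches, int in_w, int in_h, int in_c, int bs)
{
    const int r = in_c / (bs * bs); /* output channel count */

    for (int n = 0; n < batches; ++n)
        for (int y = 0; y < in_h; ++y)
            for (int x = 0; x < in_w; ++x)
                for (int c = 0; c < in_c; ++c)
                {
                    /* Same decomposition as the kernel body. */
                    const int z     = c % r;
                    const int out_x = x * bs + (c / r) % bs;
                    const int out_y = y * bs + (c / r) / bs;

                    const size_t src = (((size_t)n * in_h + y) * in_w + x) * in_c + c;
                    const size_t dst = (((size_t)n * (in_h * bs) + out_y) * (in_w * bs) + out_x) * r + z;
                    out[dst] = in[src];
                }
}
```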
diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
new file mode 100644
index 0000000000..238d3a7921
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+ scale -= max(xi - (int)LAST_ACCESSED_X, 0);
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file
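Per element, the vectorized kernel above reduces to a single multiply: the quantized value is converted to floating point and scaled by its channel's scale (channels run along X in NHWC). A scalar sketch with illustrative names:

```c
/* QSYMM8_PER_CHANNEL -> float: result = scale[channel] * (float)q */
static inline float dequantize_per_channel_ref(signed char q, const float *scale, int channel)
{
    return scale[channel] * (float)q;
}
```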
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
new file mode 100644
index 0000000000..81ceeb8846
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "helpers_asymm.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void direct_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, 1, M0, xi);
+ TILE(int, 1, M0, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[0].s[i] = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[0].s[i] = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
+ xi[0].s[i] -= PAD_LEFT;
+ yi[0].s[i] -= PAD_TOP;
+ })
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ TILE(int, 1, M0, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[0].s[i] + xk;
+ int y_s = yi[0].s[i] + yk;
+ my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
+ my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
+ my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
+ })
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+#if defined(IS_QUANTIZED)
+
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ // Apply activation
+ T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+}
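Every tensor dimension and tiling parameter of the kernel above is baked in at compile time through the -D definitions listed in its notes. Purely as an illustration of what a host might pass to the OpenCL compiler before building direct_convolution_nhwc (the helper name and all values are placeholders, not the library's actual configuration code):

```c
#include <stdio.h>

/* Hypothetical helper: assemble build options for a non-quantized FP32
 * configuration with example placeholder sizes (3x3 kernel, stride 1). */
int build_direct_conv_options(char *buf, size_t len)
{
    return snprintf(buf, len,
                    "-DSRC_TENSOR_TYPE=BUFFER -DWEI_TENSOR_TYPE=BUFFER -DDST_TENSOR_TYPE=BUFFER "
                    "-DSRC_DATA_TYPE=float -DWEI_DATA_TYPE=float -DDST_DATA_TYPE=float -DACC_DATA_TYPE=float "
                    "-DSRC_WIDTH=96 -DSRC_HEIGHT=64 -DSRC_CHANNELS=64 "
                    "-DDST_WIDTH=96 -DDST_HEIGHT=64 -DDST_CHANNELS=64 "
                    "-DWEI_WIDTH=3 -DWEI_HEIGHT=3 "
                    "-DPAD_LEFT=1 -DPAD_TOP=1 -DSTRIDE_X=1 -DSTRIDE_Y=1 "
                    "-DM0=4 -DN0=4 -DK0=4 -DPARTIAL_N0=0 -DZERO_VALUE=0 "
                    "-DACTIVATION_TYPE=identity -DA_VAL=0 -DB_VAL=0");
}
```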
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
new file mode 100644
index 0000000000..807b990e82
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution 3d.
+ *
+ * @note Data layout supported: NDHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left, top and front) must be passed at compile time using -DPAD_LEFT, -DPAD_TOP and -DPAD_FRONT (e.g. -DPAD_LEFT=2, -DPAD_TOP=2, -DPAD_FRONT=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2, -DSTRIDE_Z=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH, -DWEI_HEIGHT and -DWEI_DEPTH (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9, -DWEI_DEPTH=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64, -DSRC_DEPTH=32)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH, -DDST_HEIGHT and -DDST_DEPTH (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64, -DDST_DEPTH=32)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, ..., n
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time along with its tensor type by using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=int).
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ */
+//! @endcond
+__kernel void direct_convolution3d_ndhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ TENSOR4D(wei, BUFFER)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IWEI_DEPTH WEI_DEPTH
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_DEPTH SRC_DEPTH
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_DEPTH DST_DEPTH
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH)
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT x DEPTH
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+ TILE(int, M0, 1, zi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[i].v = (((mout + i) / _IDST_WIDTH) % _IDST_HEIGHT) * STRIDE_Y;
+ zi[i].v = (((mout + i) / (_IDST_WIDTH * _IDST_HEIGHT)) % _IDST_DEPTH) * STRIDE_Z;
+
+ xi[i].v -= PAD_LEFT;
+ yi[i].v -= PAD_TOP;
+ zi[i].v -= PAD_FRONT;
+ })
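+
+ // Worked example (illustrative): with _IDST_WIDTH = 4, _IDST_HEIGHT = 3 and unit strides,
+ // mout + i = 14 decomposes as xi = 14 % 4 = 2, yi = (14 / 4) % 3 = 0, zi = 14 / 12 = 1,
+ // i.e. the 15th linear output point maps to the spatial coordinate (x, y, z) = (2, 0, 1)
+ // before the padding offsets are subtracted.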
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = (ACC_DATA_TYPE)0;
+ })
+
+ for(int i = 0; i < _IY_MULTIPLIER; ++i)
+ {
+ int ck = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = (i / _IWEI_WIDTH) % _IWEI_HEIGHT;
+ int zk = i / (_IWEI_WIDTH * _IWEI_HEIGHT);
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, K0, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
+
+ // Load tile from the weights tensor
+ const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ if((cout + i) < _IDST_CHANNELS)
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b[i].s[j] = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + j * wei_stride_y + b_offs * wei_stride_y);
+ })
+ }
+ })
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ ck += K0;
+ }
+
+#if((_ISRC_CHANNELS % K0) != 0)
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, 1, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
+
+ // Load tile from the weights tensor
+ const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ if((cout + i) < _IDST_CHANNELS)
+ {
+ b[i].v = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + b_offs * wei_stride_y);
+ }
+ })
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ ++ck;
+ }
+#endif // ((_ISRC_CHANNELS % K0) != 0)
+ }
+
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
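+
+ // Sketch of the algebra behind this constant (assuming T_OFFSET_CORRECTION above adds the two
+ // data-dependent cross terms): expanding sum_k (a_k + SRC_OFFSET) * (b_k + WEI_OFFSET) gives
+ // sum_k a_k * b_k + WEI_OFFSET * sum_k a_k + SRC_OFFSET * sum_k b_k + K * SRC_OFFSET * WEI_OFFSET.
+ // The first three terms are accumulated inside the loop; the data-independent last term is added
+ // once here, with K = _IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH * _ISRC_CHANNELS.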
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ if((cout + N0) <= _IDST_CHANNELS)
+ {
+ bias0[0].v = VLOAD(N0)(0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, PARTIAL_N0)
+ (bias0[0].v, 0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
+ }
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH);
+ })
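+
+ // Example (illustrative): with _IDST_WIDTH * _IDST_HEIGHT * _IDST_DEPTH = 10, M0 = 4 and mout = 8,
+ // the clamp above yields rows 8, 9, 9, 9 (plus the batch offset). The duplicates are harmless:
+ // the tile is stored in reverse order below, so the entry that genuinely computes row 9 (i = 1)
+ // is written after the clamped duplicates (i = 2, 3) and the valid values win.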
+
+#if defined(IS_QUANTIZED)
+ TILE(DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
new file mode 100644
index 0000000000..dcbae220b6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+// *INDENT-OFF*
+// clang-format off
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, ..., n (M0 != 1 is supported only when STRIDE_X == 1 && DILATION_X == 1)
+ * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void dwc_native_fp_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Only the weight tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) rows of tile A (if M0 != 1, the tiles overlap by 1 element along the X dimension)
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0 // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
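+
+ // _IBOUNDARY_CHECK is false only for the 1x1, unpadded, M0 == 1 case, where every load is
+ // guaranteed to fall inside the tensor; all other configurations keep the boundary-aware
+ // (and more expensive) load path in T_LOAD_NHWC_WITH_DILATION below.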
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+ int xi = xo * STRIDE_X;
+ int yi = yo * STRIDE_Y;
+ xi -= PAD_LEFT;
+ yi -= PAD_TOP;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+#if _IWEI_HEIGHT < 5
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT < 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
+#endif // _IWEI_HEIGHT < 5
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = 0;
+ })
+
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
+
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, the loads shared by adjacent output points along the X (WIDTH) dimension are performed only once
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ c[m0].v += a[xk + m0].v * b[xk].v;
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+ })
+ })
+ }
+#if _IWEI_HEIGHT < 5
+ )
+#endif // _IWEI_HEIGHT < 5
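+
+ // Example (illustrative) of the load sharing mentioned above: with _IWEI_WIDTH = 3 and M0 = 2,
+ // tile A holds M0_A = 3 + (2 - 1) = 4 input points along X; output point m0 = 0 reads a[0..2]
+ // and m0 = 1 reads a[1..3], so two of the three loads per output point are reused.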
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+#endif // HAS_BIAS
+
+ T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+}
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
new file mode 100644
index 0000000000..2d255e5b61
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// *INDENT-OFF*
+// clang-format off
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_uchar (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_char (128)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_uchar (-128)
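+
+// These corrections shift the weights into the numeric domain of the source data so that the
+// integer dot product in the kernel operates on one representation: e.g. a uchar source with char
+// weights adds 128 to each weight, mapping the signed range [-128, 127] onto [0, 255]. The shift is
+// compensated after the reduction via the (WEI_OFFSET - WEI_OFFSET_CORRECTION) term in the kernel.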
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
+ ({})
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
+ TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
+ TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
+ T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout, 0, 0, 0, multipliers); \
+ T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout, 0, 0, 0, shifts);
+
+#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
+#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
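+
+// QUANTIZATION_TYPE is expected to expand to PER_TENSOR or PER_CHANNEL (an assumption based on the
+// macro suffixes above): PER_TENSOR relies solely on the compile-time DST_MULTIPLIER/DST_SHIFT
+// defines, while PER_CHANNEL loads one requantization multiplier and shift per output channel from
+// the dst_multipliers/dst_shifts tensors declared in the kernel signature.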
+
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for quantized data types
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=char)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=char)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=char)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=int)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The activation type must be passed at compile time using -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=relu)
+ * @note The A and B variables required by some activation functions must be passed at compile time using -DA_VAL= and -DB_VAL= respectively
+ * @note The quantization offset used for both the per-tensor and per-channel quantization must be passed at compile time using -DDST_OFFSET (e.g. -DDST_OFFSET=3)
+ * @note The quantization shift for the per-tensor quantization must be passed at compile time using -DDST_SHIFT (e.g. -DDST_SHIFT=1)
+ * @note The quantization multiplier for the per-tensor quantization must be passed at compile time using -DDST_MULTIPLIER (e.g. -DDST_MULTIPLIER=121432)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, ..., n (M0 != 1 is supported only when STRIDE_X == 1 && DILATION_X == 1)
+ * - N0 = 2, 3, 4, 8, 16
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32
+ * @param[in] dst_multipliers_stride_x Stride of the destination multipliers tensor in X dimension (in bytes)
+ * @param[in] dst_multipliers_step_x dst_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_multipliers_offset_first_element_in_bytes The offset of the first element in the destination multipliers tensor
+ * @param[in] dst_shifts_ptr Pointer to the destination shifts tensor for the per-channel quantization. Supported data type: S32
+ * @param[in] dst_shifts_stride_x Stride of the destination shifts tensor in X dimension (in bytes)
+ * @param[in] dst_shifts_step_x dst_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_shifts_offset_first_element_in_bytes The offset of the first element in the destination shifts tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ */
+//! @endcond
+__kernel void dwc_native_quantized_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE),
+ VECTOR_DECLARATION(dst_multipliers),
+ VECTOR_DECLARATION(dst_shifts)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Only the weight tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) rows of tile A (if M0 != 1, the tiles overlap by 1 element along the X dimension)
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0 // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+ int xi = xo * STRIDE_X;
+ int yi = yo * STRIDE_Y;
+ xi -= PAD_LEFT;
+ yi -= PAD_TOP;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+#if _IWEI_HEIGHT <= 5
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
+
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, the loads shared by adjacent output points along the X (WIDTH) dimension are performed only once
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+#if _IWEI_WIDTH <= 16
+#define DOT_DATA_TYPE SRC_DATA_TYPE
+#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
+
+ // Optimized path for the dot instruction
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+ ACC_DATA_TYPE offset_a = 0;
+ ACC_DATA_TYPE offset_b = 0;
+
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ x0[0].s[xk] = a[xk + m0].s[n0];
+ y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+ })
+ DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+ c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+#else // _IWEI_WIDTH <= 16
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
+ })
+#endif // _IWEI_WIDTH <= 16
+ })
+ })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
+
+#if _IWEI_WIDTH <= 16
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
+#endif // _IWEI_WIDTH <= 16
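+
+ // Sketch of the algebra for the dot path (with corr = WEI_OFFSET_CORRECTION): per row the kernel
+ // accumulates dot(a, b + corr) + sum(a) * (WEI_OFFSET - corr) + sum(b + corr) * SRC_OFFSET, which
+ // expands to sum(a*b) + WEI_OFFSET * sum(a) + SRC_OFFSET * sum(b) + SRC_OFFSET * corr * _IWEI_WIDTH.
+ // Comparing against the target sum((a + SRC_OFFSET) * (b + WEI_OFFSET)) leaves exactly the constant
+ // _IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (WEI_OFFSET - corr) that is added above.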
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ // Load bias
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+#endif // HAS_BIAS
+
+ T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
+
+ // Quantize the tile
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
+
+ // Perform activation
+ T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+}
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
new file mode 100644
index 0000000000..a23e943fab
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define COND_N SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
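+
+// COND_N mirrors VECTOR_N with a signed integer element of the same size: OpenCL's select() chooses
+// components based on the most significant bit of a signed integer vector whose element width matches
+// the data being selected, which is why the padding conditions below are cast to COND_N.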
+
+#if defined(IM2COL_3X3) || defined(IM2COL_9X9)
+/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
+ * @name IM2COL1X9_NHWC_STORE
+ *
+ * @note To use this macro for a 3x3 block, @p ROW has to be 0
+ *
+ * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
+ * @param[in] DATA_TYPE Data type of @p DATA
+ * @param[in] SRC_DEPTH Input channel size / depth
+ * @param[in] DATA Value variable base name
+ * @param[in] ROW The row number to store. Supported: 0-8
+ * @param[in] OUTPUT_PTR Output pointer
+ * @{
+ */
+#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ const bool at_channel_boundary = get_global_id(0) == 0; \
+ if(at_channel_boundary) \
+ { \
+ IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ } \
+ else \
+ { \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ }
+#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
+#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+
+#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+
+#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+/** @}*/
+#endif // defined(IM2COL_3X3) || defined(IM2COL_9X9)
+
+#if defined(IM2COL_3X3)
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The strides along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note If biases are going to be added to the convolution, -DHAS_BIAS has to be passed at compile time so that a 1 is appended to each row of the output matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch index
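+
+ // Example (illustrative): with SRC_DEPTH = 10 and VECTOR_SIZE = 4, BOUNDARY_VECTOR_SIZE would be 2;
+ // shift_amount = 2, so work-item 0 keeps ch = 0 and stores only 2 channels, while work-items 1 and 2
+ // are shifted back to ch = 2 and ch = 6. Every vector access therefore stays within the 10 channels.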
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int3 offset = 0;
+
+ // Clamp xi
+ int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
+ xi_offset *= (int3)src_stride_y;
+
+ // Out-of-bound condition for X
+ int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+ // yi == 0
+ // Clamp yi
+ // yi_coord is cast to unsigned int in order to use just a min() operation
+ // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+ // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
+ // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
+ // also causes y_cond (y padding condition) to be satisfied
+ yi_coord = yi - (int)PAD_TOP;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
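+
+ // Numeric example (illustrative): with PAD_TOP = 2 and yi = 0, yi_coord = -2 wraps to 4294967294
+ // as unsigned, so the min() above clamps it to SRC_HEIGHT - 1; the y_cond select() below then
+ // replaces the loaded values with PAD_VALUE.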
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 1
+ // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+ yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 2
+ // Clamp yi_coord
+ yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // Store in a boundary-aware way to avoid padding
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
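+ // Illustrative reading of the write below: output_ptr already includes the ch * sizeof(DATA_TYPE)
+ // displacement, so casting to DATA_TYPE * and subtracting ch elements lands back at the start of
+ // the row; adding SRC_DEPTH * 9 then addresses the single bias slot appended after the 9 patch
+ // columns of the row.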
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_3X3)
+
+#if defined(IM2COL_9X9)
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The strides along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note If biases are going to be added to the convolution, -DHAS_BIAS has to be passed at compile time so that a 1 is appended to each row of the output matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch index
+
+ // Calculate input indices
+ const int xi = (yo % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (yo / (int)CONVOLVED_WIDTH) * STRIDE_Y;
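+ // e.g. with CONVOLVED_WIDTH=34, STRIDE_X=2 and STRIDE_Y=2 (illustrative values), the
+ // work-item with yo=35 maps to the input window origin xi = (35 % 34) * 2 = 2, yi = (35 / 34) * 2 = 2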
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int8 offset0 = 0;
+ int offset1 = 0;
+
+ // Clamp xi
+ int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+ int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+ xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ xi_offset0 *= (int8)src_stride_y;
+ xi_offset1 *= (int)src_stride_y;
+
+ // Out-of-bound condition for X
+ int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+ int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ IM2COL1x9(0);
+ IM2COL1x9(1);
+ IM2COL1x9(2);
+ IM2COL1x9(3);
+ IM2COL1x9(4);
+ IM2COL1x9(5);
+ IM2COL1x9(6);
+ IM2COL1x9(7);
+ IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in the configure_opencl_kernel method in CLIm2ColKernel.cpp
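+ // Illustrative layout: with SRC_DEPTH=3 and a 9x9 kernel, each output row holds
+ // 3 * 81 = 243 im2col values, and the bias 1.0f is appended at column 243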
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_9X9)
+
+#if defined(IM2COL_GENERIC)
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of the output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch index
+
+ // Calculate input indices
+ const int xi = (yo % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (yo / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ const int stride_x = ch * sizeof(DATA_TYPE);
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + stride_x + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + stride_x + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int i = 0;
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ // Clamp yi_coord
+ int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+ yi_coord = clamp(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+
+ // Out-of-bound condition for Y
+ int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
+
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+ {
+ // Clamp xi_coord
+ int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+ xi_coord = clamp(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+ // Out-of-bound condition for X
+ int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
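+ // offset is the byte offset of pixel (xi_coord, yi_coord) within the current batch;
+ // in NHWC, the VECTOR_SIZE channels starting at ch are then contiguous in memory from there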
+
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Replace with PAD_VALUE if the value is out-of-bound
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ // Store in a boundary-aware way to avoid padding
+#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ const bool at_channel_boundary = get_global_id(0) == 0;
+ if(at_channel_boundary)
+ {
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ else // at_channel_boundary
+#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ {
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ i++;
+ }
+ }
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in the configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_GENERIC)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
new file mode 100644
index 0000000000..aa719bfef0
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(INDIRECT_CONVOLUTION_ADDRESS_PRECALCULATION)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the indirect convolution 2d indirect buffer.
+ *
+ * @note This kernel only works for a batch size of 1
+ *
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The kernel width must be passed at compile time using -DWEI_CONV_WIDTH (e.g. -DWEI_CONV_WIDTH=9)
+ * @note The spatial dimensions of the source tensor used by conv2d must be passed at compile time using -DSRC_CONV_WIDTH and -DSRC_CONV_HEIGHT (e.g. -DSRC_CONV_WIDTH=96, -DSRC_CONV_HEIGHT=64)
+ * @note The width dimension of the destination tensor produced by conv2d must be passed at compile time using -DDST_CONV_WIDTH (e.g. -DDST_CONV_WIDTH=96)
+ * @note The tensor type ("BUFFER" only) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
+ *
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: INT32
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void indirect_convolution_address_precalculation(
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Note: WIDTH = M0 x KernelWidth x KernelHeight
+
+ // m index
+ const int mi = x % M0;
+ // Kernel index
+ const int ki = x / M0;
+ // Kernel width coordinate
+ const int xk = ki % WEI_CONV_WIDTH;
+ // Kernel height coordinate
+ const int yk = ki / WEI_CONV_WIDTH;
+
+ TILE(DST_DATA_TYPE, 1, 1, xi);
+ TILE(DST_DATA_TYPE, 1, 1, yi);
+ TILE(DST_DATA_TYPE, 1, 1, my);
+
+ const int mout = y * M0;
+
+ xi[0].s[0] = ((mout + mi) % DST_CONV_WIDTH) * STRIDE_X;
+ yi[0].s[0] = ((mout + mi) / DST_CONV_WIDTH) * STRIDE_Y;
+ xi[0].s[0] -= PAD_LEFT;
+ yi[0].s[0] -= PAD_TOP;
+
+ const int x_s = xi[0].s[0] + xk;
+ const int y_s = yi[0].s[0] + yk;
+ my[0].s[0] = x_s + y_s * SRC_CONV_WIDTH;
+ my[0].s[0] = my[0].s[0] + z * (int)(SRC_CONV_WIDTH * SRC_CONV_HEIGHT);
+ my[0].s[0] = select(-1, my[0].s[0], x_s >= 0);
+ my[0].s[0] = select(-1, my[0].s[0], x_s < SRC_CONV_WIDTH);
+ my[0].s[0] = select(-1, my[0].s[0], y_s >= 0);
+ my[0].s[0] = select(-1, my[0].s[0], y_s < SRC_CONV_HEIGHT);
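+ // At this point my holds the linearized source pixel index for kernel point (xk, yk),
+ // or -1 when (x_s, y_s) falls inside the padding region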
+
+ VSTORE(1)
+ (my[0].s[0], 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DST_DATA_TYPE) + y * dst_stride_y + z * dst_stride_z));
+}
+#endif // defined(INDIRECT_CONVOLUTION_ADDRESS_PRECALCULATION)
+
+#if defined(INDIRECT_CONVOLUTION_NHWC)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the indirect convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The vector length used for loading the values from the indirect buffer should be passed at compile time using -DIND_BUFF_VEC_SIZE (e.g. -DIND_BUFF_VEC_SIZE=4)
+ * @note The activation function to fuse and corresponding A and B values should be passed at compile time using -DACTIVATION_TYPE, -DA_VAL, and -DB_VAL
+ * (e.g. -DACTIVATION_TYPE=lu_brelu_op, -DA_VAL=3.0, and -DB_VAL=1.0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, and 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] off_img (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE
+ * @param[in] off_ptr Pointer to the indirect buffer tensor. Supported data type: INT32
+ * @param[in] off_stride_y Stride of the indirect buffer tensor in Y dimension (in bytes)
+ * @param[in] off_stride_z Stride of the indirect buffer tensor in Z dimension (in bytes)
+ * @param[in] off_stride_w Stride of the indirect buffer tensor in W dimension (in bytes)
+ * @param[in] off_c The size of the channels dimension of the indirect buffer tensor
+ * @param[in] off_w The size of the width dimension of the indirect buffer tensor
+ * @param[in] off_h The size of the height dimension of the indirect buffer tensor
+ * @param[in] off_n The size of the batches dimension of the indirect buffer tensor
+ * @param[in] off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void indirect_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_RO_T(off, OFF_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ off_offset_first_element_in_bytes += get_global_id(1) * off_stride_y;
+ off_offset_first_element_in_bytes += bout * off_stride_z;
+
+ // Initialize the accumulators
+ TILE(DST_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
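+ // Loop over the kernel spatial points; each iteration gathers the source row indices for
+ // M0 output points through the precalculated indirect buffer and accumulates into the M0 x N0 tile c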
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ TILE(int, 1, IND_BUFF_VEC_SIZE, my);
+ T_LOAD(int, 1, IND_BUFF_VEC_SIZE, OFF_TENSOR_TYPE, off, i * M0, 0, 1, 0, my);
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.0;
+ })
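+ // Note: entries with indirect index -1 (padded positions) are skipped by the indirect
+ // load below, leaving the zeros from the initialization above in the accumulation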
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.0;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // defined(HAS_BIAS)
+
+ // Apply activation
+ T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
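+ // The min() above clamps the out-of-range rows of a partial M0 block onto the last
+ // valid output row; the reverse-order store below ensures the valid value wins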
+
+ const bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IY_MULTIPLIER
+}
+#endif // defined(INDIRECT_CONVOLUTION_NHWC)
diff --git a/src/core/CL/cl_kernels/nhwc/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
new file mode 100644
index 0000000000..7e35e161c8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(WIDTH_SIZE)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=half
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of channels should be given as a preprocessor argument using -DWIDTH_SIZE=size. e.g. -DWIDTH_SIZE=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Offset computation
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int left_slice = max((int)0, (int)x_offs - (int)RADIUS);
+ const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS);
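+ // e.g. with RADIUS=2 and WIDTH_SIZE=192 (illustrative values), the element at x_offs=1
+ // is normalized over the squared sum of channels 0..3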
+
+ for(int i = left_slice; i <= right_slice; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE)));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized);
+
+ STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(WIDTH_SIZE)
+
+#if defined(NUM_SLICES) && defined(DIM1_SIZE)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=half
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Offset computation
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ const int current_cols = get_global_id(1);
+ const int current_rows = get_global_id(2);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int first_col = max(0, current_cols - (int)RADIUS);
+ const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
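+ // e.g. with RADIUS=2 and DIM1_SIZE=112 (illustrative values), column 1 is normalized
+ // over columns 0..3 (and, when IN_MAP_2D is defined, over the same window of rows)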
+
+#if defined(IN_MAP_2D)
+ const int first_row = max(0, current_rows - (int)RADIUS);
+ const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+ for(int j = first_row; j <= last_row; ++j)
+ {
+#else // defined(IN_MAP_2D)
+ const int j = current_rows;
+#endif /* defined(IN_MAP_2D) */
+ for(int i = first_col; i <= last_col; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+#if defined(IN_MAP_2D)
+ }
+#endif /* defined(IN_MAP_2D) */
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * input_stride_y + current_rows * input_stride_z)), normalized);
+
+ STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(NUM_SLICES) && defined(DIM1_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..86c33499e2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
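+ // Work-items other than the first are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements;
+ // the first (global id 0) is clamped to 0 and handles the partial store, avoiding out-of-bounds accesses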
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
+
+ const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr);
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ TYPE res0 = (data - curr_mean) / curr_std;
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..7bc3c15a63
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
+
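+ // Dequantize with the affine scheme real_value = (quantized_value - OFFSET) * SCALE,
+ // normalize in float, then requantize the result with the inverse mapping at the end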
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT);
+
+ // Perform normalization
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
new file mode 100644
index 0000000000..4e5481d1db
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+/** Performs a 3D pooling layer of size equal to MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes)
+ * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc(
+ TENSOR5D_DECLARATION(input),
+ TENSOR5D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, we shift back so that get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the nearest smaller supported vector size. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+ // The depth size dimension and the batch size dimension are collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+ int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
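+ // e.g. with DST_HEIGHT=8 and DST_DEPTH=4 (illustrative values), the collapsed index 37
+ // decomposes into idx_out_h = 5, idx_out_d = 0 and idx_out_n = 1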
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+ output_stride_w + idx_out_n * output_stride_v;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+ int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+ // The start of width to consider in calculation should exclude padding
+ int pool_x_s = max((int)0, -idx_in_w);
+ // Assuming symmetric padding (left padding = right padding = PAD_X), the filter end should be either the pool width or whatever remains from the current position to (src width + pad right)
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+ int pool_z_s = max((int)0, -idx_in_d);
+ int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+ // The filter size with all padding in all directions considered.
+ int filter_size = pool_z_e * pool_y_e * pool_x_e;
+
+ // The end of width to consider in calculation should exclude PAD_X
+ pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+ pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+ pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+#if defined(EXCLUDE_PADDING)
+ filter_size = (pool_z_e - pool_z_s) * (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#endif // defined(EXCLUDE_PADDING)
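+ // e.g. a 3x3x3 window centred on a corner, with 1-element padding on each side (illustrative
+ // sizes), divides by 8 when EXCLUDE_PADDING is defined instead of by the full 27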
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ // Global pooling path
+ for(int z = 0; z < POOL_SIZE_Z; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ for(int z = pool_z_s; z < pool_z_e; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+            // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is different from DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src)),
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+ // Store result
+#if defined(QUANTIZED)
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#elif defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
new file mode 100644
index 0000000000..abf0db9d07
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
+ { \
+ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
+ }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
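+
+// Worked example of REQUANTIZE (hypothetical parameters): with in_offset = 5, in_scale = 0.1f,
+// out_offset = 10 and out_scale = 0.2f, an input value of 25 dequantizes to (25 - 5) * 0.1f = 2.0f
+// and requantizes to 2.0f / 0.2f + 10 = 20 before the saturating conversion.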
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+/** Performs a 3D pooling layer of size MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8_SIGNED, QASYMM8
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED, QASYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes)
+ * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc_quantized(
+ TENSOR5D_DECLARATION(input),
+ TENSOR5D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements so that get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+ // The depth size dimension and the batch size dimension are collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+ int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+ output_stride_w + idx_out_n * output_stride_v;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+ int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+ // The start of width to consider in calculation should exclude padding
+ int pool_x_s = max((int)0, -idx_in_w);
+    // Padding is assumed to be symmetric (left padding = right padding = PAD_X); the filter end is either the pool width or what remains from the current position to (src width + pad right)
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+ int pool_z_s = max((int)0, -idx_in_d);
+ int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = pool_z_e * pool_y_e * pool_x_e;
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+
+ // The end of width to consider in calculation should exclude PAD_X
+ pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+ pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+ pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+ for(int z = pool_z_s; z < pool_z_e; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+ data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ filter_size++;
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ }
+ }
+ }
+
+#if defined(POOL_AVG)
+ res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
+#endif // defined(POOL_AVG)
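+    // Note: in the average path above, adding (filter_size >> 1) before the integer division
+    // rounds to nearest instead of truncating, e.g. a lane sum of 7 with filter_size = 2 gives
+    // (7 + 1) / 2 = 4 rather than 3.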
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
new file mode 100644
index 0000000000..5b59ff5088
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+/** Performs a pooling layer of size MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements so that get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(EXCLUDE_PADDING)
+ int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#else // defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(EXCLUDE_PADDING)
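+    // Example (hypothetical sizes): a 3x3 window at the first output position with
+    // PAD_X = PAD_Y = 1 (idx_in_w = idx_in_h = -1) overlaps four valid pixels, so filter_size
+    // is 4 with -DEXCLUDE_PADDING and 9 without it.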
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ // Global pooling path
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+            // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is different from DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+
+/** Performs a pooling layer of size 2x2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2x2_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+ ,
+ TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements so that get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
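+    // Example of the leftover handling above (hypothetical sizes): with VEC_SIZE = 4 and
+    // VEC_SIZE_LEFTOVER = 3, work-item 0 starts at channel 0 and stores 3 elements, while
+    // work-item g >= 1 starts at channel g * 4 - 1, covering the channel dimension without
+    // out-of-bounds accesses.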
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else // DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+ int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+ int x0 = pool_x_s + idx_in_w;
+ int y0 = pool_y_s + idx_in_h;
+ int x1 = pool_x_e - 1 + idx_in_w;
+ int y1 = pool_y_e - 1 + idx_in_h;
+
+ REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
+
+#if defined(FP_MIXED_PRECISION)
+    // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is different from DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+ data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+ data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+ data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+ if(filter_size != 4)
+ {
+ SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+ SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+ SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+ SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+        // Invalidate the loaded values whose x or y coordinate was clamped (out-of-bounds)
+ data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+ data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
+ data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
+ data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
+ }
+#endif // !defined(POOL_MAX)
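+
+    // Border behaviour sketch (hypothetical case): with PAD_X = PAD_Y = 1 and
+    // idx_out_w = idx_out_h = 0, idx_in_w = idx_in_h = -1 and x0 = x1 = y0 = y1 = 0, so all
+    // four loads read the same valid pixel; the select() pass above then keeps only data3,
+    // replaces the other taps with INITIAL_VALUE, and filter_size = 1 for the average.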
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+ data1 *= data1;
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = data0;
+ res0 = POOL_OP(res0, data1);
+ res0 = POOL_OP(res0, data2);
+ res0 = POOL_OP(res0, data3);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#if defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#else // !defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
+#endif // defined(EXCLUDE_PADDING)
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+
+ // This part is used to return the index of the maximum value
+    // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for both the input and the output tensor
+
+    // Note: The batch dimension does not contribute to the offset computation
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ base_index = (uint)idx_out_c;
+
+ base_index += VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+
+ index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
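+    // The three select() calls above run a two-round tournament: the first two pick the
+    // winning index within each row pair (data0 vs data1, data2 vs data3), the last one picks
+    // between the row winners, leaving the flat index of the overall 2x2 maximum in index0.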
+
+ __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
+ indices_stride_w;
+
+ // Store result
+ STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..46268a4a88
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
+ { \
+ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \
+ }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+/** Performs a pooling layer of size MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+    // Note: If C is not a multiple of VEC_SIZE, we shift back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements so that get_global_id(0) == 0 computes the leftover elements
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+ data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ filter_size++;
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ }
+ }
+
+#if defined(POOL_AVG)
+ res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
+#endif // defined(POOL_AVG)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ // Store result
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
new file mode 100644
index 0000000000..a340b0b8a2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
+ ({ \
+ int offset = zo / (int)SRC_DEPTH; \
+ xi = xo * (int)STRIDE + offset % (int)STRIDE; \
+ yi = yo * (int)STRIDE + offset / (int)STRIDE; \
+ zi = zo % SRC_DEPTH; \
+ })
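+
+// Worked example of the mapping above (hypothetical shape): with SRC_DEPTH = 4 and STRIDE = 2,
+// the output element (xo, yo, zo) = (1, 1, 9) gives offset = 9 / 4 = 2, hence
+// xi = 1 * 2 + 2 % 2 = 2, yi = 1 * 2 + 2 / 2 = 3 and zi = 9 % 4 = 1.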
+
+/** Performs a reorganization layer from the input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(1);
+ int yo = get_global_id(2);
+ int zo = get_global_id(0);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
new file mode 100644
index 0000000000..e071b0f192
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(SCALE_NEAREST_NEIGHBOUR)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the NEAREST NEIGHBOUR method. (NHWC)
+ *
+ * @note The sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The tensor type (only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
+ */
+//! @endcond
+__kernel void scale_nearest_neighbour_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
+#elif SAMPLING_POLICY_CENTER
+ float xi_f = ((xo + 0.5f) * scale_x);
+ float yi_f = ((yo + 0.5f) * scale_y);
+#else // SAMPLING_POLICY
+#error("Unsupported sampling policy");
+#endif // SAMPLING_POLICY
+
+#ifdef ALIGN_CORNERS
+ xi_f = round(xi_f);
+ yi_f = round(yi_f);
+#endif // ALIGN_CORNERS
+
+ const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
+ const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
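+    // Example (hypothetical sizes, SAMPLING_POLICY_TOP_LEFT): downscaling src_w = 8 to
+    // dst_w = 4 gives scale_x = 2.0f, so output column xo = 3 samples input column
+    // xi0 = clamp((int)(3 * 2.0f), 0, 7) = 6.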
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
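+    // Example (hypothetical sizes): xo = 2, yo = 1, bout = 1 with dst_w = 4 and dst_h = 3
+    // gives dst_indirect_y = 2 + 1 * 4 + 1 * (4 * 3) = 18, i.e. the row index in the
+    // implicitly collapsed W * H * N dimension.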
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
+}
+#endif /* SCALE_NEAREST_NEIGHBOUR */
+
+#if defined(SCALE_BILINEAR)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note If the replicate border mode is used, it should be passed as -DBORDER_MODE_REPLICATE
+ * @note The sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The tensor type (only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
+ *
+ * @note In case of QASYMM8, the following extra information must be passed at compile time:
+ * - The source offset e.g. -DOFFSET=4
+ * - The source scale e.g. -DSCALE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
+ */
+//! @endcond
+__kernel void scale_bilinear_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
+#elif SAMPLING_POLICY_CENTER
+ float xi_f = ((xo + 0.5f) * scale_x - 0.5f);
+ float yi_f = ((yo + 0.5f) * scale_y - 0.5f);
+#else // SAMPLING_POLICY
+#error("Unsupported sampling policy");
+#endif // SAMPLING_POLICY
+
+ const int xi = (int)floor(xi_f);
+ const int yi = (int)floor(yi_f);
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+ TILE(SRC_DATA_TYPE, 1, N0, in01);
+ TILE(SRC_DATA_TYPE, 1, N0, in10);
+ TILE(SRC_DATA_TYPE, 1, N0, in11);
+
+ // Initialize the tiles to CONSTANT_VALUE
+ in00[0].v = CONSTANT_VALUE;
+ in01[0].v = CONSTANT_VALUE;
+ in10[0].v = CONSTANT_VALUE;
+ in11[0].v = CONSTANT_VALUE;
+
+#ifndef BORDER_MODE_REPLICATE
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
+#else // BORDER_MODE_REPLICATE
+ const int xi0 = clamp(xi, 0, (int)src_w - 1);
+ const int yi0 = clamp(yi, 0, (int)src_h - 1);
+ const int xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ const int yi1 = clamp(yi + 1, 0, (int)src_h - 1);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
+#endif // BORDER_MODE_REPLICATE
+
+ TILE(DST_DATA_TYPE, 1, N0, out);
+
+#if defined(IS_FLOATING_POINT)
+ const SRC_DATA_TYPE a = (SRC_DATA_TYPE)(xi_f - (float)xi);
+ const SRC_DATA_TYPE b = (SRC_DATA_TYPE)(1.f - a);
+ const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi);
+ const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1);
+
+ // Calculate the output
+ out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
+#else // defined(IS_FLOATING_POINT)
+
+ const float a = (xi_f - (float)xi);
+ const float b = (1.f - a);
+ const float a1 = (yi_f - (float)yi);
+ const float b1 = (1.f - a1);
+
+ out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
+ (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
+ (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
+ (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
+ VEC_DATA_TYPE(DST_DATA_TYPE, N0));
+#endif // defined(IS_FLOATING_POINT)
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
+}
+#endif /* SCALE_BILINEAR */
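The floating-point branch above is standard bilinear weighting. A scalar C reference of the same blend, as a minimal sketch assuming a single-channel plane and replicate borders (the helper name is hypothetical):

#include <math.h>

/* Scalar sketch of the per-channel bilinear blend computed above.
 * src is a src_h x src_w single-channel plane; border mode is replicate. */
static float bilinear_sample(const float *src, int src_w, int src_h,
                             float xi_f, float yi_f)
{
    const int   xi = (int)floorf(xi_f);
    const int   yi = (int)floorf(yi_f);
    const float a  = xi_f - (float)xi; /* horizontal weight */
    const float a1 = yi_f - (float)yi; /* vertical weight   */

#define CLAMP(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))
    const int x0 = CLAMP(xi, 0, src_w - 1),     y0 = CLAMP(yi, 0, src_h - 1);
    const int x1 = CLAMP(xi + 1, 0, src_w - 1), y1 = CLAMP(yi + 1, 0, src_h - 1);
#undef CLAMP

    return src[y0 * src_w + x0] * (1.f - a) * (1.f - a1)
         + src[y0 * src_w + x1] * a * (1.f - a1)
         + src[y1 * src_w + x0] * (1.f - a) * a1
         + src[y1 * src_w + x1] * a * a1;
}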
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
new file mode 100644
index 0000000000..695bd4c217
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ * @note The number of batches in the input tensor must be passed at compile time using -DBATCH_IN. e.g. -DBATCH_IN=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in]  paddings_stride_x                             Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in]  paddings_step_x                               paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  paddings_stride_y                             Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in]  paddings_step_y                               paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  paddings_offset_first_element_in_bytes        The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes     The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
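The index arithmetic above inverts the block layout: each output batch carries one (block_x, block_y) phase of the padded input, with BATCH_IN input batches interleaved. A host-side C sketch of the same mapping (hypothetical helper, mirroring pos_x/pos_y/w above):

/* Sketch of the space-to-batch coordinate mapping used above.
 * batch_id indexes the output batches (batch_in * block_x * block_y of them). */
static void space_to_batch_coords(int out_x, int out_y, int batch_id,
                                  int batch_in, int block_x, int block_y,
                                  int pad_left_x, int pad_left_y,
                                  int *in_x, int *in_y, int *in_batch)
{
    const int phase = batch_id / batch_in;            /* which block phase */
    const int pos_x = out_x * block_x + phase % block_x;
    const int pos_y = out_y * block_y + phase / block_x;

    *in_x     = pos_x - pad_left_x; /* negative or >= width means padding area */
    *in_y     = pos_y - pad_left_y;
    *in_batch = batch_id % batch_in; /* source batch */
}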
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ * @note The number of batches in the input tensor must be passed at compile time using -DBATCH_IN. e.g. -DBATCH_IN=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
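Since the static variant bakes every parameter in at compile time, a host-side build-options string for it might look as follows. These are illustrative values only; note that the kernel body also reads BATCH_IN, which therefore has to be defined even though it is absent from the preprocessor guard above:

#include <CL/cl.h>

/* Illustrative compile-time configuration for space_to_batch_static_nhwc. */
static const char *build_opts =
    "-DDATA_TYPE=float -DBATCH_SIZE=4 -DBATCH_IN=1 "
    "-DBLOCK_SHAPE_X=2 -DBLOCK_SHAPE_Y=2 "
    "-DPAD_LEFT_X=0 -DPAD_RIGHT_X=0 -DPAD_LEFT_Y=0 -DPAD_RIGHT_Y=0 "
    "-DWIDTH_IN=8 -DHEIGHT_IN=8";

/* ... then: clBuildProgram(program, 1, &device, build_opts, NULL, NULL); */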
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
new file mode 100644
index 0000000000..10aac6d5fb
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=8
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
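Read in scalar form, the flattened output channel index selects one of BLOCK_SHAPE * BLOCK_SHAPE spatial phases plus a source channel. A C sketch of the mapping (hypothetical helper; channel_size is the output channel count, matching CHANNEL_SIZE above):

/* Sketch of the NHWC space-to-depth index mapping used above.
 * gid0 is the flattened output channel index; r = channel_size / block^2
 * is the number of input channels. */
static void space_to_depth_coords(int gid0, int x, int y,
                                  int channel_size, int block_shape,
                                  int *in_c, int *in_x, int *in_y)
{
    const int r     = channel_size / (block_shape * block_shape);
    const int phase = gid0 / r;        /* which spatial phase */
    *in_c = gid0 % r;                  /* source channel      */
    *in_x = x * block_shape + phase % block_shape;
    *in_y = y * block_shape + phase / block_shape;
}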
diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
new file mode 100644
index 0000000000..1393537283
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the transposed convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The transposed convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The transposed convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type (currently only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (currently only "BUFFER" is supported) of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (currently only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the bias tensor must be passed at compile time using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note If bias exists, the compile time argument -DHAS_BIAS should be passed
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in]  src_ptr                                       Pointer to the source tensor. Supported data type: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels (IFM) dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in]  src_n                                         The size of the batches dimension of the source tensor
+ * @param[in]  src_offset_first_element_in_bytes             The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels (OFM) dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels (IFM) dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches (OFM) dimension of the weights tensor
+ * @param[in]  wei_offset_first_element_in_bytes             The offset of the first element in the weights tensor
+ * @param[in]  bia_ptr                                       (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void transposed_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, 1, M0, xi);
+ TILE(int, 1, M0, yi);
+ TILE(int, 1, M0, xu);
+ TILE(int, 1, M0, yu);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xu[0].s[i] = ((mout + i) % _IDST_WIDTH) - PAD_LEFT;
+ yu[0].s[i] = ((mout + i) / _IDST_WIDTH) - PAD_TOP;
+ xi[0].s[i] = ceil(xu[0].s[i] / (float)STRIDE_X);
+ yi[0].s[i] = ceil(yu[0].s[i] / (float)STRIDE_Y);
+ })
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ // Flipped indices
+ const int x_start = _IWEI_WIDTH - (xi[0].s[0] * STRIDE_X - xu[0].s[0]) - 1;
+ const int y_start = _IWEI_HEIGHT - (yi[0].s[0] * STRIDE_Y - yu[0].s[0]) - 1;
+
+ for(int yk = y_start, yi_step = 0; yk >= 0; yk -= STRIDE_Y, ++yi_step)
+ {
+ for(int xk = x_start, xi_step = 0; xk >= 0; xk -= STRIDE_X, ++xi_step)
+ {
+ const int weights_y = cout * _IY_MULTIPLIER + yk * _IWEI_WIDTH + xk;
+
+ TILE(int, 1, M0, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[0].s[i] + xi_step;
+ int y_s = yi[0].s[i] + yi_step;
+ my[0].s[i] = x_s + y_s *_ISRC_WIDTH;
+ my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
+ my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
+ })
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+#if defined(IS_QUANTIZED)
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+#endif // defined(IS_QUANTIZED)
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, BUFFER, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+#if defined(IS_QUANTIZED)
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+#endif // defined(IS_QUANTIZED)
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+ }
+
+#if defined(IS_QUANTIZED)
+    const int total_pixels = floor(1 + y_start / (float)STRIDE_Y) * floor(1 + x_start / (float)STRIDE_X);
+
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (total_pixels * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+#endif // defined(IS_QUANTIZED)
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+#if defined(IS_QUANTIZED)
+
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+}
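The x_start/y_start computation above picks, per output pixel, the highest flipped-weight tap that lines up with an integer-stride input sample; the loops then step the tap index down by the stride while the input coordinate advances by one. A 1-D C sketch of that alignment (hypothetical helper):

#include <math.h>

/* 1-D sketch of the tap/input alignment used above for transposed conv.
 * Returns the first (highest) flipped weight index and the matching input x. */
static void tconv_start_1d(int out_x, int pad_left, int stride, int wei_w,
                           int *x_start, int *xi)
{
    const int xu = out_x - pad_left;            /* unpadded output coordinate  */
    *xi      = (int)ceilf(xu / (float)stride);  /* first input sample >= xu/s  */
    *x_start = wei_w - (*xi * stride - xu) - 1; /* flipped weight index        */
    /* The kernel then walks xk = x_start, x_start - stride, ... >= 0,
     * advancing the input coordinate by one at each step. */
}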
diff --git a/src/core/CL/cl_kernels/nhwc/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
new file mode 100644
index 0000000000..74b9674a88
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NHWC)
+ *
+ * @attention The following variables should be passed at compile time (a scalar per-element fallback path is used when they are not defined):
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The last input element accessible on the X border (threads that would access past it step back to stay within bounds)
+ * -# -DLAST_ACCESSED_X_OUT = The last output element accessible on the X border (threads that would access past it step back to stay within bounds)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)src.ptr);
+
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
+#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
\ No newline at end of file
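Semantically, the kernel above writes each input element into a 2x2 spatial neighbourhood of the output. A scalar C reference of the same operation, as a minimal sketch for a single-channel plane (the helper name is hypothetical):

/* Sketch of the 2x nearest-neighbour upsample performed above:
 * every input element is replicated into a 2x2 block of the output. */
static void upsample_2x2(const float *src, float *dst, int src_w, int src_h)
{
    const int dst_w = src_w * 2;
    for (int y = 0; y < src_h; ++y)
    {
        for (int x = 0; x < src_w; ++x)
        {
            const float v = src[y * src_w + x];
            dst[(2 * y) * dst_w + 2 * x]         = v;
            dst[(2 * y) * dst_w + 2 * x + 1]     = v;
            dst[(2 * y + 1) * dst_w + 2 * x]     = v;
            dst[(2 * y + 1) * dst_w + 2 * x + 1] = v;
        }
    }
}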
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
new file mode 100644
index 0000000000..45fbc1b641
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp) \
+ ({ \
+ out.s0 = -tmp.s0 / 36.f; \
+ out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \
+ out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \
+ out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
+ out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
+ out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+ out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+ out.s7 = tmp.s6; \
+ })
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
+ *
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 0
+ DATA_TYPE out00, out01, out02, out03, out04, out05;
+ out00 = (w00) / 16.f;
+ out01 = (-w00 - w01 - w02) / 24.f;
+ out02 = (-w00 + w01 - w02) / 24.f;
+ out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
+ out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
+ out05 = (w02) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ DATA_TYPE out10, out11, out12, out13, out14, out15;
+ out10 = (-w00 - w10 - w20) / 24.f;
+ out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
+ out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
+ out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
+ out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
+ out15 = (-w02 - w12 - w22) / 6.f;
+
+ // Row 2
+ DATA_TYPE out20, out21, out22, out23, out24, out25;
+ out20 = (-w00 + w10 - w20) / 24.f;
+ out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
+ out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
+ out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
+ out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
+ out25 = (-w02 + w12 - w22) / 6.f;
+
+ // Row 3
+ DATA_TYPE out30, out31, out32, out33, out34, out35;
+ out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
+ out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
+ out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
+ out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
+ out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
+ out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
+
+ // Row 4
+ DATA_TYPE out40, out41, out42, out43, out44, out45;
+ out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
+ out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
+ out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
+ out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
+ out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
+ out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
+
+ // Row 5
+ DATA_TYPE out50, out51, out52, out53, out54, out55;
+ out50 = (w20) / 4.f;
+ out51 = (-w20 - w21 - w22) / 6.f;
+ out52 = (-w20 + w21 - w22) / 6.f;
+ out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
+ out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
+ out55 = (w22);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 36 channels for 3x3 kernels
+ // 6 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
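Each group of six outputs above is one row of the F(4x4, 3x3) filter transform G * g * G^T applied along one axis. In scalar C, the row transform that produces out00..out05 from w00..w02 (a reference sketch of the expressions above) is:

/* Reference sketch of the 1-D filter transform row computed above:
 * the six coefficients that map a 3-tap filter row to the Winograd domain. */
static void winograd_ft_row_f4x4_3x3(const float w[3], float out[6])
{
    out[0] =  w[0] / 16.f;
    out[1] = (-w[0] - w[1] - w[2]) / 24.f;
    out[2] = (-w[0] + w[1] - w[2]) / 24.f;
    out[3] = (w[0] + 2.f * w[1] + 4.f * w[2]) / 96.f;
    out[4] = (w[0] - 2.f * w[1] + 4.f * w[2]) / 96.f;
    out[5] =  w[2] / 4.f;
}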
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1/1x5 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
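+
+ // The two branches above differ only in addressing: the VERTICAL variant gathers
+ // the five filter taps along z (src_stride_z), while the default variant gathers
+ // them along y (src_stride_y).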
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00;
+ out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
+ out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
+ out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
+ out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
+ out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
+ out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
+ out0.s7 = w04;
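+
+ // Reading off the coefficients of out0 (and of the rows computed below), the 1D
+ // transform matrix G used here for F(4, 5) is:
+ //
+ //   g0 = [      1,      0,     0,      0,      0 ]
+ //   g1 = [   -2/9,   -2/9,  -2/9,   -2/9,   -2/9 ]
+ //   g2 = [   -2/9,    2/9,  -2/9,    2/9,   -2/9 ]
+ //   g3 = [   1/90,   2/90,  4/90,   8/90,  16/90 ]
+ //   g4 = [   1/90,  -2/90,  4/90,  -8/90,  16/90 ]
+ //   g5 = [ 16/180,  8/180, 4/180,  2/180,  1/180 ]
+ //   g6 = [ 16/180, -8/180, 4/180, -2/180,  1/180 ]
+ //   g7 = [      0,      0,     0,      0,      1 ]
+ //
+ // so the full 2D transform computed by this kernel is out = G * w * G^T.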
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
+ out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+ out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+ out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+ (w04 + w14 + w24 + w34 + w44)) / 405.f;
+ out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+ (w04 + w14 + w24 + w34 + w44)) / 405.f;
+ out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
+ (w04 + w14 + w24 + w34 + w44)) / 810.f;
+ out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
+ (w04 + w14 + w24 + w34 + w44)) / 810.f;
+ out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
+ out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+ out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+ out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+ (w04 - w14 + w24 - w34 + w44)) / 405.f;
+ out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+ (w04 - w14 + w24 - w34 + w44)) / 405.f;
+ out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
+ (w04 - w14 + w24 - w34 + w44)) / 810.f;
+ out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
+ (w04 - w14 + w24 - w34 + w44)) / 810.f;
+ out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
+ out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
+ (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+ out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
+ (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+ out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
+ * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+ out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
+ * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+ out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+ out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+ out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
+ out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
+ (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+ out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
+ (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+ out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
+ * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+ out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
+ * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+ out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+ out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+ out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
+ out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
+ (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+ out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
+ (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+ out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
+ * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+ out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
+ * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+ (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+ (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+ out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
+ out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
+ (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+ out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
+ (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+ out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
+ * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+ out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
+ * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+ (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+ (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+ out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40;
+ out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
+ out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
+ out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
+ out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
+ out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
+ out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
+ out7.s7 = w44;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels: each transformed coefficient goes to its
+ // own z-plane of dst, so plane k holds element k of the transformed tile for every
+ // (filter, channel) pair
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+
+/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_2x2_7x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp = 0.0f;
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+
+ out0.s0 = -w00 / 36.0f;
+ out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
+ out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
+ out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
+ out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
+ out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
+ out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
+ out0.s7 = w06;
+
+ out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8))(-36.f); // Vector splat of -36.f (a cast, not a subtraction)
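+ // After this division the row-0 terms carry their final scale, e.g.
+ // out0.s0 = w00 / 1296 and out0.s7 = -w06 / 36.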
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+
+ tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
+ tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
+ tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
+ tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
+ tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
+ tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
+ tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
+
+ OUTPUT_ROW_2x2_7x7(out1, tmp);
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+
+ tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
+ tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
+ tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
+ tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
+ tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
+ tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
+ tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
+
+ OUTPUT_ROW_2x2_7x7(out2, tmp);
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+
+ tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
+ tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
+ tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
+ tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
+ tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
+ tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
+ tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
+
+ OUTPUT_ROW_2x2_7x7(out3, tmp);
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+
+ tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
+ tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
+ tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
+ tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
+ tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
+ tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
+ tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
+
+ OUTPUT_ROW_2x2_7x7(out4, tmp);
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+
+ tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
+ tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
+ tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
+ tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
+ tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
+ tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
+ tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
+
+ OUTPUT_ROW_2x2_7x7(out5, tmp);
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+
+ tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
+ tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
+ tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
+ tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
+ tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
+ tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
+ tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
+
+ OUTPUT_ROW_2x2_7x7(out6, tmp);
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+
+ tmp.s0 = w60;
+ tmp.s1 = w61;
+ tmp.s2 = w62;
+ tmp.s3 = w63;
+ tmp.s4 = w64;
+ tmp.s5 = w65;
+ tmp.s6 = w66;
+
+ OUTPUT_ROW_2x2_7x7(out7, tmp);
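+
+ // Rows 1..7 all follow one pattern: tmp holds a column-wise combination of the seven
+ // filter rows (scaled by 1/48, 1/120 or 1/720, or taken verbatim for the last row),
+ // and OUTPUT_ROW_2x2_7x7 then applies the matching 1D transform along the row.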
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
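+
+// Example (illustrative): based on the @note requirements above, a host-side
+// build-options string that compiles this kernel could look like:
+//
+//   "-DWINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL -DDATA_TYPE=half"
+//
+// Both defines are needed: the first also compiles the shared
+// winograd_filter_transform_4x4_3x3_nhwc body (its guard OR-includes 4X1_3X1),
+// and the second switches that body onto its one-dimensional load/store paths.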
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
+ *
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
+{
+ winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
new file mode 100644
index 0000000000..7341336b92
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - (DATA_TYPE)4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - (DATA_TYPE)4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = (DATA_TYPE)2.5f * tmp.s3; \
+ comm_fact.s3 = (DATA_TYPE)0.5f * tmp.s1 + (DATA_TYPE)2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = (DATA_TYPE)0.25f * tmp.s2 - (DATA_TYPE)1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = (DATA_TYPE)4.f * tmp.s2 + tmp.s6 - (DATA_TYPE)5.f * tmp.s4; \
+ comm_fact.s6 = (DATA_TYPE)2.f * tmp.s1 + (DATA_TYPE)0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + (DATA_TYPE)5.25f * tmp.s4 - (DATA_TYPE)5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + (DATA_TYPE)5.25f * tmp.s3 - (DATA_TYPE)5.25f * tmp.s5; \
+ })
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = (DATA_TYPE)36.0f * tmp.s2 - (DATA_TYPE)13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = (DATA_TYPE)36.0f * tmp.s1 - (DATA_TYPE)13.0f * tmp.s3 + (DATA_TYPE)1.0f * tmp.s5; \
+ comm_fact.s2 = (DATA_TYPE)9.0f * tmp.s2 - (DATA_TYPE)10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = (DATA_TYPE)18.0f * tmp.s1 - (DATA_TYPE)20.0f * tmp.s3 + (DATA_TYPE)2.0f * tmp.s5; \
+ comm_fact.s4 = (DATA_TYPE)4.0f * tmp.s2 - (DATA_TYPE)5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = (DATA_TYPE)12.0f * tmp.s1 - (DATA_TYPE)15.0f * tmp.s3 + (DATA_TYPE)3.0f * tmp.s5; \
+ out.s0 = -(DATA_TYPE)36.0f * tmp.s0 + (DATA_TYPE)49.0f * tmp.s2 - (DATA_TYPE)14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -(DATA_TYPE)36.0f * tmp.s1 + (DATA_TYPE)0.0f * tmp.s2 + (DATA_TYPE)49.0f * tmp.s3 - (DATA_TYPE)14.0f * tmp.s5 + tmp.s7; \
+ })
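+
+// A minimal scalar sketch (not part of the kernels below) of the 1D transform
+// that OUTPUT_ROW_2x2_7x7 evaluates for an 8-point input row d[0..7]. The
+// function name is hypothetical and purely illustrative; the arithmetic
+// mirrors the macro one-to-one.
+#if 0
+static void winograd_f2_7_input_row_ref(const float d[8], float out[8])
+{
+    const float cf0 = 36.0f * d[2] - 13.0f * d[4] + d[6];
+    const float cf1 = 36.0f * d[1] - 13.0f * d[3] + d[5];
+    const float cf2 = 9.0f * d[2] - 10.0f * d[4] + d[6];
+    const float cf3 = 18.0f * d[1] - 20.0f * d[3] + 2.0f * d[5];
+    const float cf4 = 4.0f * d[2] - 5.0f * d[4] + d[6];
+    const float cf5 = 12.0f * d[1] - 15.0f * d[3] + 3.0f * d[5];
+    out[0] = -36.0f * d[0] + 49.0f * d[2] - 14.0f * d[4] + d[6];
+    out[1] = cf0 - cf1;
+    out[2] = cf0 + cf1;
+    out[3] = cf2 - cf3;
+    out[4] = cf2 + cf3;
+    out[5] = cf4 - cf5;
+    out[6] = cf4 + cf5;
+    out[7] = -36.0f * d[1] + 49.0f * d[3] - 14.0f * d[5] + d[7];
+}
+#endif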
+
+#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+#if defined(NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size is 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
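+ // x and y are the top-left coordinates of the input tile; they can be
+ // negative because of the convolution padding, and any out-of-bounds
+ // element keeps the zero value the input tile is initialized with.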
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 6, N0, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 6, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 6, N0, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v *= (DATA_TYPE)4.0f;
+ })
+
+ com[0].v = in[2].v - (DATA_TYPE)4.f * in[0].v;
+ com[1].v = in[3].v - (DATA_TYPE)4.f * in[1].v;
+ com[2].v = in[4].v - (DATA_TYPE)4.f * in[2].v;
+ com[3].v = in[5].v - (DATA_TYPE)4.f * in[3].v;
+ com[4].v = in[3].v - in[1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4].v - in[2].v;
+
+ out[0].v = com[2].v - com[0].v;
+ out[1].v = com[2].v + com[1].v;
+ out[2].v = com[2].v - com[1].v;
+ out[3].v = com[5].v + com[4].v;
+ out[4].v = com[5].v - com[4].v;
+ out[5].v = com[3].v - com[1].v;
+
+ TILE(uint, 6, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
+ })
+
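+ // Store with indirect Y offsets: transformed element i of each tile lands
+ // in row block i (each block is _INUM_TILES_X * _INUM_TILES_Y rows), with a
+ // further per-batch offset.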
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 36, N0, in);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 6, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 6, N0, com);
+ TILE(DATA_TYPE, 36, N0, tmp);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
+ com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
+ com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
+ com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
+ com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
+ tmp[i + 0 * 6].v = com[2].v - com[0].v;
+ tmp[i + 1 * 6].v = com[2].v + com[1].v;
+ tmp[i + 2 * 6].v = com[2].v - com[1].v;
+ tmp[i + 3 * 6].v = com[5].v + com[4].v;
+ tmp[i + 4 * 6].v = com[5].v - com[4].v;
+ tmp[i + 5 * 6].v = com[3].v - com[1].v;
+ })
+
+ TILE(DATA_TYPE, 36, N0, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = tmp[i * 6 + 2].v - (DATA_TYPE)4.f * tmp[i * 6 + 0].v;
+ com[1].v = tmp[i * 6 + 3].v - (DATA_TYPE)4.f * tmp[i * 6 + 1].v;
+ com[2].v = tmp[i * 6 + 4].v - (DATA_TYPE)4.f * tmp[i * 6 + 2].v;
+ com[3].v = tmp[i * 6 + 5].v - (DATA_TYPE)4.f * tmp[i * 6 + 3].v;
+ com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
+ out[i * 6 + 0].v = com[2].v - com[0].v;
+ out[i * 6 + 1].v = com[2].v + com[1].v;
+ out[i * 6 + 2].v = com[2].v - com[1].v;
+ out[i * 6 + 3].v = com[5].v + com[4].v;
+ out[i * 6 + 4].v = com[5].v - com[4].v;
+ out[i * 6 + 5].v = com[3].v - com[1].v;
+ })
+
+ // Compute destination address
+ TILE(uint, 36, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
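+
+// Illustrative host-side sketch (not part of this file): a possible set of
+// build options for the kernel above, assembled from its @note requirements.
+// The concrete values (fp16, 96x64 source, N0=2, padding 1) are arbitrary
+// examples, and the variable names are hypothetical.
+#if 0
+const char *build_opts =
+    "-DNHWC -DWINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC "
+    "-DDATA_TYPE=half -DN0=2 -DPAD_LEFT=1 -DPAD_TOP=1 "
+    "-DSRC_WIDTH=96 -DSRC_HEIGHT=64 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4";
+// err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
+#endif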
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 1, 8, com);
+
+ com[0].s[0] = in[2].v - (DATA_TYPE)4.25f * in[4].v + in[6].v;
+ com[0].s[1] = in[1].v - (DATA_TYPE)4.25f * in[3].v + in[5].v;
+ com[0].s[2] = (DATA_TYPE)0.5f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
+ com[0].s[3] = (DATA_TYPE)0.25f * in[2].v - (DATA_TYPE)1.25f * in[4].v + in[6].v;
+ com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
+ com[0].s[5] = (DATA_TYPE)2.0f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)0.5f * in[5].v;
+ out[0].s[0] = in[0].v - (DATA_TYPE)5.25f * in[2].v + (DATA_TYPE)5.25f * in[4].v - in[6].v;
+ out[1].s[0] = com[0].s[0] + com[0].s[1];
+ out[2].s[0] = com[0].s[0] - com[0].s[1];
+ out[3].s[0] = com[0].s[3] + com[0].s[2];
+ out[4].s[0] = com[0].s[3] - com[0].s[2];
+ out[5].s[0] = com[0].s[4] + com[0].s[5];
+ out[6].s[0] = com[0].s[4] - com[0].s[5];
+ out[7].s[0] = -in[1].v + (DATA_TYPE)5.25f * in[3].v - (DATA_TYPE)5.25f * in[5].v + in[7].v;
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
+ com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
+ com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
+ com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v + com[1].v;
+ tmp[2].v = com[0].v - com[1].v;
+ tmp[3].v = com[2].v + com[3].v;
+ tmp[4].v = com[2].v - com[3].v;
+ tmp[5].v = com[4].v + com[5].v;
+ tmp[6].v = com[4].v - com[5].v;
+ tmp[7].v = com[7].v;
+
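+ // tmp now holds the tile transformed along one spatial direction; the loop
+ // below applies the same 1D transform along the other direction.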
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = tmp[i].s[2] - (DATA_TYPE)4.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = tmp[i].s[1] - (DATA_TYPE)4.25f * tmp[i].s[3] + tmp[i].s[5];
+ com[0].s[2] = (DATA_TYPE)0.5f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
+ com[0].s[3] = (DATA_TYPE)0.25f * tmp[i].s[2] - (DATA_TYPE)1.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = (DATA_TYPE)2.0f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)0.5f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = tmp[i].s[0] - (DATA_TYPE)5.25f * tmp[i].s[2] + (DATA_TYPE)5.25f * tmp[i].s[4] - tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
+ out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
+ out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 7].s[0] = -tmp[i].s[1] + (DATA_TYPE)5.25f * tmp[i].s[3] - (DATA_TYPE)5.25f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
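+
+// A minimal scalar sketch (not part of this file) of the 1D transform used by
+// the 4x4_5x5 kernel above on an 8-point input row d[0..7]. The function name
+// is hypothetical; the arithmetic mirrors the kernel one-to-one.
+#if 0
+static void winograd_f4_5_input_row_ref(const float d[8], float out[8])
+{
+    const float cf0 = d[2] - 4.25f * d[4] + d[6];
+    const float cf1 = d[1] - 4.25f * d[3] + d[5];
+    const float cf2 = 0.5f * d[1] - 2.5f * d[3] + 2.0f * d[5];
+    const float cf3 = 0.25f * d[2] - 1.25f * d[4] + d[6];
+    const float cf4 = 4.0f * d[2] - 5.0f * d[4] + d[6];
+    const float cf5 = 2.0f * d[1] - 2.5f * d[3] + 0.5f * d[5];
+    out[0] = d[0] - 5.25f * d[2] + 5.25f * d[4] - d[6];
+    out[1] = cf0 + cf1;
+    out[2] = cf0 - cf1;
+    out[3] = cf3 + cf2;
+    out[4] = cf3 - cf2;
+    out[5] = cf4 + cf5;
+    out[6] = cf4 - cf5;
+    out[7] = -d[1] + 5.25f * d[3] - 5.25f * d[5] + d[7];
+}
+#endif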
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1 or 1x7, the output tile is 2x2/2x1 or 1x2 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v *= (DATA_TYPE)-36.0f;
+ })
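+ // The row is pre-scaled by -36 so the transform below can be written with
+ // the same small integer coefficients as OUTPUT_ROW_2x2_7x7.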
+
+ TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
+
+ com[0].s[0] = (DATA_TYPE)36.0f * in[2].v - (DATA_TYPE)13.0f * in[4].v + in[6].v;
+ com[0].s[1] = (DATA_TYPE)36.0f * in[1].v - (DATA_TYPE)13.0f * in[3].v + (DATA_TYPE)1.0f * in[5].v;
+ com[0].s[2] = (DATA_TYPE)9.0f * in[2].v - (DATA_TYPE)10.0f * in[4].v + in[6].v;
+ com[0].s[3] = (DATA_TYPE)18.0f * in[1].v - (DATA_TYPE)20.0f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
+ com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
+ com[0].s[5] = (DATA_TYPE)12.0f * in[1].v - (DATA_TYPE)15.0f * in[3].v + (DATA_TYPE)3.0f * in[5].v;
+ out[0].s[0] = -(DATA_TYPE)36.0f * in[0].v + (DATA_TYPE)49.0f * in[2].v - (DATA_TYPE)14.0f * in[4].v + in[6].v;
+ out[1].s[0] = com[0].s[0] - com[0].s[1];
+ out[2].s[0] = com[0].s[0] + com[0].s[1];
+ out[3].s[0] = com[0].s[2] - com[0].s[3];
+ out[4].s[0] = com[0].s[2] + com[0].s[3];
+ out[5].s[0] = com[0].s[4] - com[0].s[5];
+ out[6].s[0] = com[0].s[4] + com[0].s[5];
+ out[7].s[0] = -(DATA_TYPE)36.0f * in[1].v + (DATA_TYPE)0.0f * in[2].v + (DATA_TYPE)49.0f * in[3].v - (DATA_TYPE)14.0f * in[5].v + in[7].v;
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
+ com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
+ com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
+ com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v - com[1].v;
+ tmp[2].v = com[0].v + com[1].v;
+ tmp[3].v = com[2].v - com[3].v;
+ tmp[4].v = com[2].v + com[3].v;
+ tmp[5].v = com[4].v - com[5].v;
+ tmp[6].v = com[4].v + com[5].v;
+ tmp[7].v = com[7].v;
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = (DATA_TYPE)36.0f * tmp[i].s[2] - (DATA_TYPE)13.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = (DATA_TYPE)36.0f * tmp[i].s[1] - (DATA_TYPE)13.0f * tmp[i].s[3] + (DATA_TYPE)1.0f * tmp[i].s[5];
+ com[0].s[2] = (DATA_TYPE)9.0f * tmp[i].s[2] - (DATA_TYPE)10.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[3] = (DATA_TYPE)18.0f * tmp[i].s[1] - (DATA_TYPE)20.0f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
+ com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = (DATA_TYPE)12.0f * tmp[i].s[1] - (DATA_TYPE)15.0f * tmp[i].s[3] + (DATA_TYPE)3.0f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[0] + (DATA_TYPE)49.0f * tmp[i].s[2] - (DATA_TYPE)14.0f * tmp[i].s[4] + tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
+ out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
+ out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 7].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[1] + (DATA_TYPE)0.0f * tmp[i].s[2] + (DATA_TYPE)49.0f * tmp[i].s[3] - (DATA_TYPE)14.0f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
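+
+// Illustrative host-side sketch (not part of this file): a plausible NDRange
+// for winograd_input_transform_4x4_3x3_stepz1_nhwc, matching its
+// GET_SPATIAL_IDX() usage (x: channels split by N0, y: number of tiles,
+// z: batches when IS_BATCHED is defined). All variable names are hypothetical.
+#if 0
+size_t gws[3] = { (channels + N0 - 1) / N0, num_tiles_x * num_tiles_y, batches };
+// err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
+#endif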
+
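+// The remaining kernels are thin wrappers: each Nx1 / 1xN variant forwards to
+// the corresponding 2D kernel above, where the WINOGRAD_INPUT_TRANSFORM_HORIZONTAL
+// or WINOGRAD_INPUT_TRANSFORM_VERTICAL define selects the 1D code path.
+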
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+#endif // defined(NHWC)
+#endif // defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
new file mode 100644
index 0000000000..9eb995fbb2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -0,0 +1,1109 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size is 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] _ISRC_HEIGHT The source tensor's height
+ * @param[in] _IDST_WIDTH The destination tensor's width
+ * @param[in] _IDST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int _ISRC_HEIGHT,
+ const int _IDST_WIDTH,
+ const int _IDST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
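+ // mout indexes the Winograd output tiles in raster order; e.g. with NUM_TILES_X = 16 and
+ // OUTPUT_TILE_W = OUTPUT_TILE_H = 2, mout = 17 maps to the tile at (x_out, y_out) = (2, 2).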
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 2, N0, out);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 8 channels to compose the 8x1 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out00 and out01
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
+ out[1].v = -in[1].v + in[2].v - (DATA_TYPE)2.0f * in[3].v + (DATA_TYPE)2.0f * in[4].v - (DATA_TYPE)3.0f * in[5].v + (DATA_TYPE)3.0f * in[6].v + in[7].v;
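+ // i.e. out = A^T * in, where A^T is the 2x8 inverse-transform matrix of F(2, 7):
+ // A^T = [ 1  1  1  1  1  1  1  0 ]
+ //       [ 0 -1  1 -2  2 -3  3  1 ]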
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 2, 1, dst_indirect_y);
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
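+ // The min() clamp above maps out-of-range tile elements to the last valid row/column;
+ // e.g. with _IDST_WIDTH = 5, OUTPUT_TILE_W = 2 and x_out = 4, both elements write to column 4.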
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 16, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 64 channels to compose the 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
+ tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - (DATA_TYPE)2 * in[24 + i].v + (DATA_TYPE)2 * in[32 + i].v - (DATA_TYPE)3 * in[40 + i].v + (DATA_TYPE)3 * in[48 + i].v + in[56 + i].v;
+ })
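+ // tmp now holds the 2x8 product A^T * in, column-interleaved:
+ // tmp[2 * j] is row 0 and tmp[2 * j + 1] is row 1 of column j.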
+
+ // Compute the 2x2 output tile
+ LOOP_UNROLLING(int, i, 0, 1, 2,
+ {
+ out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
+ out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - (DATA_TYPE)2 * tmp[6 + i].v + (DATA_TYPE)2 * tmp[8 + i].v - (DATA_TYPE)3 * tmp[10 + i].v + (DATA_TYPE)3 * tmp[12 + i].v + tmp[14 + i].v;
+ })
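+ // out now holds the 2x2 spatial tile (A^T * in) * A in row-major order.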
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
+ dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(uint, 6, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 6 channels to compose the 6x1 or 1x6 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out00, out01, out02 and out03
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
+ out[1].v = in[1].v - in[2].v + (DATA_TYPE)2.0f * in[3].v - (DATA_TYPE)2.0f * in[4].v;
+ out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * in[3].v + (DATA_TYPE)4.0f * in[4].v;
+ out[3].v = in[1].v - in[2].v + (DATA_TYPE)8.0f * in[3].v - (DATA_TYPE)8.0f * in[4].v + in[5].v;
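+ // i.e. out = A^T * in, where A^T is the 4x6 inverse-transform matrix of F(4, 3):
+ // A^T = [ 1  1  1  1  1  0 ]
+ //       [ 0  1 -1  2 -2  0 ]
+ //       [ 0  1  1  4  4  0 ]
+ //       [ 0  1 -1  8 -8  1 ]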
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 36, N0, in);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 36, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 36 channels to compose the 6x6 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ tmp[0].v = in[6 + i].v + in[12 + i].v;
+ tmp[1].v = in[6 + i].v - in[12 + i].v;
+ tmp[2].v = in[18 + i].v + in[24 + i].v;
+ tmp[3].v = in[18 + i].v - in[24 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ in[i].v = in[i].v + tmp[0].v + tmp[2].v;
+ in[6 + i].v = tmp[3].v + tmp[1].v;
+ in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
+ })
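+ // in[0..23] now holds the 4x6 product A^T * in, with row r stored in in[6 * r .. 6 * r + 5].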
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
+ tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
+ tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
+ tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
+ out[4 * i + 1].v = tmp[3].v + tmp[1].v;
+ out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
+ })
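+ // out now holds the 4x4 spatial tile (A^T * in) * A in row-major order.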
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" contains 1x8 or 8x1 tile here
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in; in this 1D case "out" consists of a single column/row
+ tmp[0].v = in[1].v - in[2].v;
+ tmp[1].v = (DATA_TYPE)2.0f * (in[3].v - in[4].v);
+ tmp[2].v = (DATA_TYPE)2.0f * (in[5].v + in[6].v);
+ tmp[3].v = in[3].v + in[4].v;
+ out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + (DATA_TYPE)4.0f * tmp[2].v;
+ out[1].v = tmp[0].v + tmp[1].v + (DATA_TYPE)4.0f * (in[5].v - in[6].v);
+ out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * tmp[3].v + tmp[2].v;
+ out[3].v = tmp[0].v + (DATA_TYPE)4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
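+ // i.e. out = A^T * in, where A^T is the 4x8 inverse-transform matrix of F(4, 5):
+ // A^T = [ 1  1  1  1  1  8  8  0 ]
+ //       [ 0  1 -1  2 -2  4 -4  0 ]
+ //       [ 0  1  1  4  4  2  2  0 ]
+ //       [ 0  1 -1  8 -8  1 -1  1 ]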
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 6, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" here is 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[0].v = in[8 + i].v + in[16 + i].v;
+ tmp[1].v = in[8 + i].v - in[16 + i].v;
+ tmp[2].v = in[24 + i].v + in[32 + i].v;
+ tmp[3].v = in[24 + i].v - in[32 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[40 + i].v + in[48 + i].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[40 + i].v - in[48 + i].v;
+
+ // 4x8 matrix as a result
+ in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
+ in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
+ })
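+ // in[0..31] now holds the 4x8 product A^T * in, with row r stored in in[8 * r .. 8 * r + 7].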
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ // in * A, with in = A^T * in as above
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
+ tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
+ tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
+ tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
+
+ // 4x4 tile
+ out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
+ out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
+ })
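+ // out now holds the 4x4 spatial tile (A^T * in) * A in row-major order.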
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)