Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--  src/core/CL/cl_kernels/absdiff.cl | 65
-rw-r--r--  src/core/CL/cl_kernels/accumulate.cl | 130
-rw-r--r--  src/core/CL/cl_kernels/activation_float_helpers.h | 16
-rw-r--r--  src/core/CL/cl_kernels/activation_quant_helpers.h | 24
-rw-r--r--  src/core/CL/cl_kernels/arg_min_max.cl | 449
-rw-r--r--  src/core/CL/cl_kernels/batch_to_space.cl | 232
-rw-r--r--  src/core/CL/cl_kernels/batchnormalization_layer.cl | 418
-rw-r--r--  src/core/CL/cl_kernels/canny.cl | 454
-rw-r--r--  src/core/CL/cl_kernels/channel_combine.cl | 416
-rw-r--r--  src/core/CL/cl_kernels/channel_extract.cl | 272
-rw-r--r--  src/core/CL/cl_kernels/channel_shuffle.cl | 181
-rw-r--r--  src/core/CL/cl_kernels/color_convert.cl | 1911
-rw-r--r--  src/core/CL/cl_kernels/common/activation_layer.cl (renamed from src/core/CL/cl_kernels/activation_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/activation_layer_quant.cl (renamed from src/core/CL/cl_kernels/activation_layer_quant.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/arg_min_max.cl | 388
-rw-r--r--  src/core/CL/cl_kernels/common/batchnormalization_layer.cl | 183
-rw-r--r--  src/core/CL/cl_kernels/common/bitwise_op.cl (renamed from src/core/CL/cl_kernels/bitwise_op.cl) | 109
-rw-r--r--  src/core/CL/cl_kernels/common/bounding_box_transform.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform.cl) | 4
-rw-r--r--  src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform_quantized.cl) | 4
-rw-r--r--  src/core/CL/cl_kernels/common/cast.cl (renamed from src/core/CL/cl_kernels/depth_convert.cl) | 29
-rw-r--r--  src/core/CL/cl_kernels/common/col2im.cl (renamed from src/core/CL/cl_kernels/col2im.cl) | 4
-rw-r--r--  src/core/CL/cl_kernels/common/comparisons.cl | 123
-rw-r--r--  src/core/CL/cl_kernels/common/concatenate.cl (renamed from src/core/CL/cl_kernels/concatenate.cl) | 64
-rw-r--r--  src/core/CL/cl_kernels/common/convert_fc_weights.cl (renamed from src/core/CL/cl_kernels/convert_fc_weights.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/convolution_layer.cl (renamed from src/core/CL/cl_kernels/convolution_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/copy_tensor.cl (renamed from src/core/CL/cl_kernels/copy_tensor.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/crop_tensor.cl (renamed from src/core/CL/cl_kernels/crop_tensor.cl) | 4
-rw-r--r--  src/core/CL/cl_kernels/common/deconvolution_layer.cl (renamed from src/core/CL/cl_kernels/deconvolution_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/dequantization_layer.cl | 90
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_operation.cl (renamed from src/core/CL/cl_kernels/elementwise_operation.cl) | 62
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl (renamed from src/core/CL/cl_kernels/elementwise_operation_quantized.cl) | 43
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_unary.cl (renamed from src/core/CL/cl_kernels/elementwise_unary.cl) | 12
-rw-r--r--  src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl | 77
-rw-r--r--  src/core/CL/cl_kernels/common/fft.cl (renamed from src/core/CL/cl_kernels/fft.cl) | 819
-rw-r--r--  src/core/CL/cl_kernels/common/fft_digit_reverse.cl (renamed from src/core/CL/cl_kernels/fft_digit_reverse.cl) | 40
-rw-r--r--  src/core/CL/cl_kernels/common/fft_scale.cl (renamed from src/core/CL/cl_kernels/fft_scale.cl) | 19
-rw-r--r--  src/core/CL/cl_kernels/common/fill_border.cl (renamed from src/core/CL/cl_kernels/fill_border.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/floor.cl (renamed from src/core/CL/cl_kernels/floor.cl) | 24
-rw-r--r--  src/core/CL/cl_kernels/common/gather.cl (renamed from src/core/CL/cl_kernels/gather.cl) | 71
-rw-r--r--  src/core/CL/cl_kernels/common/gemm.cl (renamed from src/core/CL/cl_kernels/gemm.cl) | 1272
-rw-r--r--  src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl | 556
-rw-r--r--  src/core/CL/cl_kernels/common/gemm_utils.cl | 458
-rw-r--r--  src/core/CL/cl_kernels/common/gemmlowp.cl (renamed from src/core/CL/cl_kernels/gemmlowp.cl) | 552
-rw-r--r--  src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl | 309
-rw-r--r--  src/core/CL/cl_kernels/common/gemv.cl (renamed from src/core/CL/cl_kernels/gemv.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/generate_proposals.cl (renamed from src/core/CL/cl_kernels/generate_proposals.cl) | 14
-rw-r--r--  src/core/CL/cl_kernels/common/generate_proposals_quantized.cl (renamed from src/core/CL/cl_kernels/generate_proposals_quantized.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/instance_normalization.cl (renamed from src/core/CL/cl_kernels/instance_normalization.cl) | 128
-rw-r--r--  src/core/CL/cl_kernels/common/l2_normalize.cl | 189
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul.cl | 708
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_mmul.cl | 946
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized.cl | 833
-rw-r--r--  src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl | 832
-rw-r--r--  src/core/CL/cl_kernels/common/mean_stddev_normalization.cl (renamed from src/core/CL/cl_kernels/mean_stddev_normalization.cl) | 42
-rw-r--r--  src/core/CL/cl_kernels/common/memset.cl (renamed from src/core/CL/cl_kernels/memset.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/minmax_layer.cl (renamed from src/core/CL/cl_kernels/minmax_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/nonmax.cl (renamed from src/core/CL/cl_kernels/nonmax.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/pad_layer.cl (renamed from src/core/CL/cl_kernels/pad_layer.cl) | 120
-rw-r--r--  src/core/CL/cl_kernels/common/permute.cl (renamed from src/core/CL/cl_kernels/permute.cl) | 4
-rw-r--r--  src/core/CL/cl_kernels/common/pixelwise_mul_float.cl (renamed from src/core/CL/cl_kernels/pixelwise_mul_float.cl) | 65
-rw-r--r--  src/core/CL/cl_kernels/common/pixelwise_mul_int.cl (renamed from src/core/CL/cl_kernels/pixelwise_mul_int.cl) | 79
-rw-r--r--  src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl (renamed from src/core/CL/cl_kernels/qlstm_layer_normalization.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/quantization_layer.cl (renamed from src/core/CL/cl_kernels/quantization_layer.cl) | 6
-rw-r--r--  src/core/CL/cl_kernels/common/range.cl (renamed from src/core/CL/cl_kernels/range.cl) | 127
-rw-r--r--  src/core/CL/cl_kernels/common/reduction_operation.cl (renamed from src/core/CL/cl_kernels/reduction_operation.cl) | 372
-rw-r--r--  src/core/CL/cl_kernels/common/reshape_layer.cl (renamed from src/core/CL/cl_kernels/reshape_layer.cl) | 26
-rw-r--r--  src/core/CL/cl_kernels/common/reverse.cl (renamed from src/core/CL/cl_kernels/reverse.cl) | 30
-rw-r--r--  src/core/CL/cl_kernels/common/roi_align_layer.cl (renamed from src/core/CL/cl_kernels/roi_align_layer.cl) | 6
-rw-r--r--  src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl (renamed from src/core/CL/cl_kernels/roi_align_layer_quantized.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/roi_pooling_layer.cl (renamed from src/core/CL/cl_kernels/roi_pooling_layer.cl) | 56
-rw-r--r--  src/core/CL/cl_kernels/common/scatter.cl | 173
-rw-r--r--  src/core/CL/cl_kernels/common/select.cl (renamed from src/core/CL/cl_kernels/select.cl) | 91
-rw-r--r--  src/core/CL/cl_kernels/common/slice_ops.cl (renamed from src/core/CL/cl_kernels/slice_ops.cl) | 6
-rw-r--r--  src/core/CL/cl_kernels/common/softmax_layer.cl | 371
-rw-r--r--  src/core/CL/cl_kernels/common/stack_layer.cl (renamed from src/core/CL/cl_kernels/stack_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/common/tile.cl (renamed from src/core/CL/cl_kernels/tile.cl) | 33
-rw-r--r--  src/core/CL/cl_kernels/common/transpose.cl | 245
-rw-r--r--  src/core/CL/cl_kernels/common/unpooling_layer.cl (renamed from src/core/CL/cl_kernels/unpooling_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/comparisons.cl | 150
-rw-r--r--  src/core/CL/cl_kernels/convolution3x3.cl | 137
-rw-r--r--  src/core/CL/cl_kernels/convolution5x5.cl | 287
-rw-r--r--  src/core/CL/cl_kernels/convolution7x7.cl | 338
-rw-r--r--  src/core/CL/cl_kernels/convolution9x9.cl | 403
-rw-r--r--  src/core/CL/cl_kernels/convolution_rectangle.cl | 118
-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution.cl | 1879
-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution_quantized.cl | 1795
-rw-r--r--  src/core/CL/cl_kernels/dequantization_layer.cl | 212
-rw-r--r--  src/core/CL/cl_kernels/derivative.cl | 80
-rw-r--r--  src/core/CL/cl_kernels/dilate.cl | 56
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution1x1.cl | 432
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution3x3.cl | 470
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution5x5.cl | 549
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution9x9.cl | 364
-rw-r--r--  src/core/CL/cl_kernels/direct_convolution_quantized.cl | 878
-rw-r--r--  src/core/CL/cl_kernels/erode.cl | 56
-rw-r--r--  src/core/CL/cl_kernels/fast_corners.cl | 262
-rw-r--r--  src/core/CL/cl_kernels/gaussian_pyramid.cl | 113
-rw-r--r--  src/core/CL/cl_kernels/gemm_helpers.h | 716
-rw-r--r--  src/core/CL/cl_kernels/gemm_v1.cl | 3238
-rw-r--r--  src/core/CL/cl_kernels/harris_corners.cl | 376
-rw-r--r--  src/core/CL/cl_kernels/helpers.h | 898
-rw-r--r--  src/core/CL/cl_kernels/helpers_asymm.h | 369
-rw-r--r--  src/core/CL/cl_kernels/histogram.cl | 243
-rw-r--r--  src/core/CL/cl_kernels/hog.cl | 456
-rw-r--r--  src/core/CL/cl_kernels/integral_image.cl | 100
-rw-r--r--  src/core/CL/cl_kernels/l2_normalize.cl | 164
-rw-r--r--  src/core/CL/cl_kernels/load_store_utility.h | 84
-rw-r--r--  src/core/CL/cl_kernels/magnitude_phase.cl | 162
-rw-r--r--  src/core/CL/cl_kernels/mean_stddev.cl | 82
-rw-r--r--  src/core/CL/cl_kernels/minmaxloc.cl | 193
-rw-r--r--  src/core/CL/cl_kernels/nchw/batch_to_space.cl (renamed from src/core/CL/cl_kernels/space_to_depth.cl) | 72
-rw-r--r--  src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl | 147
-rw-r--r--  src/core/CL/cl_kernels/nchw/channel_shuffle.cl | 103
-rw-r--r--  src/core/CL/cl_kernels/nchw/depth_to_space.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nchw/dequantization_layer.cl | 86
-rw-r--r--  src/core/CL/cl_kernels/nchw/direct_convolution.cl | 147
-rw-r--r--  src/core/CL/cl_kernels/nchw/im2col.cl (renamed from src/core/CL/cl_kernels/im2col.cl) | 501
-rw-r--r--  src/core/CL/cl_kernels/nchw/normalization_layer.cl (renamed from src/core/CL/cl_kernels/normalization_layer.cl) | 122
-rw-r--r--  src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl (renamed from src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl) | 56
-rw-r--r--  src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl (renamed from src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl) | 83
-rw-r--r--  src/core/CL/cl_kernels/nchw/pooling_layer.cl | 285
-rw-r--r--  src/core/CL/cl_kernels/nchw/prior_box_layer.cl (renamed from src/core/CL/cl_kernels/prior_box_layer.cl) | 2
-rw-r--r--  src/core/CL/cl_kernels/nchw/reorg_layer.cl (renamed from src/core/CL/cl_kernels/reorg_layer.cl) | 43
-rw-r--r--  src/core/CL/cl_kernels/nchw/scale.cl | 271
-rw-r--r--  src/core/CL/cl_kernels/nchw/space_to_batch.cl | 156
-rw-r--r--  src/core/CL/cl_kernels/nchw/space_to_depth.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nchw/upsample_layer.cl | 79
-rw-r--r--  src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl | 911
-rw-r--r--  src/core/CL/cl_kernels/nchw/winograd_input_transform.cl | 1346
-rw-r--r--  src/core/CL/cl_kernels/nchw/winograd_output_transform.cl | 1082
-rw-r--r--  src/core/CL/cl_kernels/nhwc/batch_to_space.cl (renamed from src/core/CL/cl_kernels/depth_to_space.cl) | 88
-rw-r--r--  src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl | 146
-rw-r--r--  src/core/CL/cl_kernels/nhwc/channel_shuffle.cl | 160
-rw-r--r--  src/core/CL/cl_kernels/nhwc/depth_to_space.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dequantization_layer.cl | 87
-rw-r--r--  src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 295
-rw-r--r--  src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl | 281
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 211
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 275
-rw-r--r--  src/core/CL/cl_kernels/nhwc/im2col.cl | 526
-rw-r--r--  src/core/CL/cl_kernels/nhwc/indirect_convolution.cl | 305
-rw-r--r--  src/core/CL/cl_kernels/nhwc/normalization_layer.cl | 177
-rw-r--r--  src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl | 81
-rw-r--r--  src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl | 96
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl | 197
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl | 185
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_layer.cl | 364
-rw-r--r--  src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl (renamed from src/core/CL/cl_kernels/pooling_layer_quantized.cl) | 104
-rw-r--r--  src/core/CL/cl_kernels/nhwc/reorg_layer.cl (renamed from src/core/CL/cl_kernels/flatten.cl) | 54
-rw-r--r--  src/core/CL/cl_kernels/nhwc/scale.cl | 245
-rw-r--r--  src/core/CL/cl_kernels/nhwc/space_to_batch.cl (renamed from src/core/CL/cl_kernels/space_to_batch.cl) | 131
-rw-r--r--  src/core/CL/cl_kernels/nhwc/space_to_depth.cl | 69
-rw-r--r--  src/core/CL/cl_kernels/nhwc/transposed_convolution.cl | 297
-rw-r--r--  src/core/CL/cl_kernels/nhwc/upsample_layer.cl (renamed from src/core/CL/cl_kernels/upsample_layer.cl) | 59
-rw-r--r--  src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl (renamed from src/core/CL/cl_kernels/winograd_filter_transform.cl) | 965
-rw-r--r--  src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl | 1050
-rw-r--r--  src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl | 1109
-rw-r--r--  src/core/CL/cl_kernels/non_linear_filter3x3.cl | 186
-rw-r--r--  src/core/CL/cl_kernels/non_linear_filter5x5.cl | 483
-rw-r--r--  src/core/CL/cl_kernels/non_linear_filter_helpers.h | 145
-rw-r--r--  src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl | 521
-rw-r--r--  src/core/CL/cl_kernels/pooling_layer.cl | 981
-rw-r--r--  src/core/CL/cl_kernels/remap.cl | 132
-rw-r--r--  src/core/CL/cl_kernels/repeat.h | 42
-rw-r--r--  src/core/CL/cl_kernels/scale.cl | 284
-rw-r--r--  src/core/CL/cl_kernels/scale_quantized.cl | 172
-rw-r--r--  src/core/CL/cl_kernels/scharr_filter.cl | 124
-rw-r--r--  src/core/CL/cl_kernels/sobel_filter.cl | 541
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer.cl | 533
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer_quantized.cl | 532
-rw-r--r--  src/core/CL/cl_kernels/tablelookup.cl | 114
-rw-r--r--  src/core/CL/cl_kernels/threshold.cl | 104
-rw-r--r--  src/core/CL/cl_kernels/tile_helpers.h | 1451
-rw-r--r--  src/core/CL/cl_kernels/transpose.cl | 220
-rw-r--r--  src/core/CL/cl_kernels/warp_affine.cl | 120
-rw-r--r--  src/core/CL/cl_kernels/warp_helpers.h | 69
-rw-r--r--  src/core/CL/cl_kernels/warp_helpers_quantized.h | 138
-rw-r--r--  src/core/CL/cl_kernels/warp_perspective.cl | 128
-rw-r--r--  src/core/CL/cl_kernels/winograd_input_transform.cl | 2753
-rw-r--r--  src/core/CL/cl_kernels/winograd_output_transform.cl | 2266
-rw-r--r--  src/core/CL/cl_kernels/yolo_layer.cl | 172
181 files changed, 22572 insertions, 33973 deletions
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
deleted file mode 100644
index a09caf5dc5..0000000000
--- a/src/core/CL/cl_kernels/absdiff.cl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculate the absolute difference of two input images.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in] in1_ptr Pointer to the first source image. Supported data types: U8, S16
- * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] in1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] in1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16
- * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes)
- * @param[in] in2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes)
- * @param[in] in2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void absdiff(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
-{
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
- vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
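
Editor's note: for orientation, the deleted absdiff kernel converts both operands to the output type, takes abs_diff on 16-wide vectors, and stores with saturation. A minimal scalar C sketch of the same per-element semantics, for the documented combination -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short (the sketch and its function name are illustrative, not part of the commit):

    /* Per-element reference: CONVERT both inputs to DATA_TYPE_OUT,
     * take the absolute difference, then store with saturation
     * (saturation is a no-op for the u8,u8 -> s16 combination shown). */
    static short absdiff_u8_u8_to_s16(unsigned char a, unsigned char b)
    {
        short wa = (short)a, wb = (short)b; /* CONVERT to the output type */
        short d  = (short)(wa - wb);        /* cannot overflow a short */
        return d < 0 ? (short)-d : d;       /* abs_diff */
    }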
diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
deleted file mode 100644
index 9e37830f1b..0000000000
--- a/src/core/CL/cl_kernels/accumulate.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function accumulates an input image into output image.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void accumulate(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(accu))
-{
- // Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
-
- // Load data
- uchar16 in_data = vload16(0, input.ptr);
- short16 accu_data = vload16(0, (__global short *)accu.ptr);
-
- // Perform accumulation
- short16 res = add_sat(convert_short16(in_data), accu_data);
-
- // Store result
- vstore16(res, 0, (__global short *)accu.ptr);
-}
-
-/** This function accumulates a weighted value from an input image to an output image.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] alpha The float scalar value with a value in the range of 0 to 1
- */
-__kernel void accumulate_weighted(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(accu),
- const float alpha)
-{
- // Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
-
- // Load data
- const float16 in_data = convert_float16(vload16(0, input.ptr));
- const float16 accu_data = convert_float16(vload16(0, accu.ptr));
-
- // Calculate weighted accumulation
- const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
-
- // Store result
- vstore16(res, 0, accu.ptr);
-}
-
-/** This function accumulates a squared value from an input image to an output image.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] shift The U32 scalar value with a value in the range of 0 to 15
- */
-__kernel void accumulate_squared(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(accu),
- const uint shift)
-{
- // Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
-
- // Load data
- ushort16 in_data = convert_ushort16(vload16(0, input.ptr));
- uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
-
- // Calculate squared accumulation
- short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
-
- // Store result
- vstore16(res, 0, (__global short *)accu.ptr);
-}
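
Editor's note: the three deleted accumulate kernels reduce to simple per-element updates; accumulate_weighted, for example, keeps a running weighted mean. A scalar C sketch under the kernel's documented assumption that 0 <= alpha <= 1 (illustrative only):

    /* Scalar reference for accumulate_weighted with a U8 accumulator:
     * accu = (1 - alpha) * accu + alpha * in, converted back to uchar
     * (OpenCL's convert_uchar rounds toward zero by default). */
    static unsigned char accumulate_weighted_ref(unsigned char in,
                                                 unsigned char accu,
                                                 float alpha)
    {
        float r = (1.0f - alpha) * (float)accu + alpha * (float)in;
        return (unsigned char)r;
    }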
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index 91d7197889..02faae2369 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,8 @@
#endif // GPU_ARCH == GPU_ARCH_BIFROST
// Hard-Swish
-#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
// Logistic Activation
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
@@ -49,13 +50,16 @@
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
// Leaky RELU Activation
-#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
// Soft RELU Activation
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
// ELU Activation
-#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
+#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \
+ (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
// Absolute Activation
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
@@ -69,6 +73,10 @@
// Linear Activation
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+// GELU Activation
+#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
+
// Identity Activation
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
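
Editor's note: the newly added gelu_op is the exact, erf-based GELU, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))); the constant 1.41421356237 in the macro is sqrt(2). A scalar C reference (illustrative, not part of the commit):

    #include <math.h>

    /* Exact GELU as implemented by the gelu_op macro above. */
    static float gelu_ref(float x)
    {
        return 0.5f * x * (1.0f + erff(x / 1.41421356237f));
    }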
diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h
index a32e4e94a3..c758ff1278 100644
--- a/src/core/CL/cl_kernels/activation_quant_helpers.h
+++ b/src/core/CL/cl_kernels/activation_quant_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,21 +51,26 @@ inline TYPE lu_brelu_op(TYPE x)
// Hard Swish Activation
inline TYPE hard_swish_op(TYPE x)
{
- return (x * ((min(max((TYPE)(x + (TYPE)3.f), (TYPE)0.f), (TYPE)6.f)) * (TYPE)0.166666667f));
+ return (x * ((min(max((TYPE)(x + (TYPE)3.f), (TYPE)0.f), (TYPE)6.f)) * (TYPE)0.166666667f));
+}
+
+inline TYPE identiy_op(TYPE x)
+{
+ return x;
}
#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
#if defined(S1_VAL) && defined(S2_VAL)
#if defined(O1_VAL) && defined(O2_VAL)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
@@ -73,17 +78,14 @@ inline TYPE hard_swish_op(TYPE x)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
#endif /* defined(O1_VAL) && defined(O2_VAL) */
#else /* defined(S1_VAL) && defined(S2_VAL) */
-#define PERFORM_ACTIVATION_QUANT(act, data) \
- ({ \
- data = ACTIVATION_OP(act, data); \
- })
+#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); })
#endif /* defined(S1_VAL) && defined(S2_VAL) */
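
Editor's note: PERFORM_ACTIVATION_QUANT requantizes the activated value from the input quantization parameters to the output ones, fdata = round((q - O1_VAL) * (S1_VAL / S2_VAL) + O2_VAL), followed by a saturating convert. A scalar C sketch for QASYMM8, assuming S1_VAL/S2_VAL are the input/output scales and O1_VAL/O2_VAL the corresponding zero points, as the macro's arithmetic suggests (names in the sketch are illustrative):

    #include <math.h>
    #include <stdint.h>

    /* Requantization step of PERFORM_ACTIVATION_QUANT for one QASYMM8 value. */
    static uint8_t requantize_u8(uint8_t q, float s1, int o1, float s2, int o2)
    {
        float f = roundf(((float)q - (float)o1) * (s1 / s2) + (float)o2);
        if (f < 0.0f)   f = 0.0f;   /* CONVERT_SAT lower bound */
        if (f > 255.0f) f = 255.0f; /* CONVERT_SAT upper bound */
        return (uint8_t)f;
    }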
diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/arg_min_max.cl
deleted file mode 100644
index 5184e0c5b8..0000000000
--- a/src/core/CL/cl_kernels/arg_min_max.cl
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) isgreater(x, y)
-#define ISLESS(x, y) isless(x, y)
-#else // !FLOAT_DATA_TYPE
-#if defined(WIDTH)
-#define ISGREATER(x, y) (x > y) ? 1 : 0
-#define ISLESS(x, y) (x < y) ? 1 : 0
-#else // !defined(WIDTH)
-#define ISGREATER(x, y) select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
-#define ISLESS(x, y) select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
-#endif // defined(WIDTH)
-#endif // defined(FLOAT_DATA_TYPE)
-
-#if defined(ARG_MAX)
-#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
-#elif defined(ARG_MIN)
-#define CONDITION_TO_USE(x, y) ISLESS(x, y)
-#else // !(defined(ARG_MAX) || defined(ARG_MIN))
-#error "Unsupported reduction operation!"
-#endif // defined(ARG_MAX)
-
-#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
-#if defined(WIDTH)
-#if defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
-/** Find index minimum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
-{
- int end_elem = (x_idx + 1) * 16;
- if(end_elem > WIDTH)
- {
- end_elem = WIDTH - x_idx * 16;
- }
- DATA_TYPE_OUTPUT res = prev_res[0];
- for(int x_v = 1; x_v < end_elem; ++x_v)
- {
- res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < * (input + res));
- }
- return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find index minimum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
- DATA_TYPE_OUTPUT res = 0;
- for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
- {
- res = select(res, x_v, *(input + x_v) < * (input + res));
- }
- return res;
-#else // WIDTH >= 16
- int x_elem = x_idx * 16;
- const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
- x_elem -= x_goback;
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input - x_goback);
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
- VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
- idx_sel = (in.s01234567 <= in.s89abcdef);
- in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
- res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
- idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
- in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
- res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
- idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
- in.s01 = select(in.s23, in.s01, idx_sel.s01);
- res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
- idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
- res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
- return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MIN)
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
-/** Find index maximum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
-{
- int end_elem = (x_idx + 1) * 16;
- if(end_elem > WIDTH)
- {
- end_elem = WIDTH - x_idx * 16;
- }
- DATA_TYPE_OUTPUT res = prev_res[0];
- for(int x_v = 1; x_v < end_elem; ++x_v)
- {
- res = select(res, prev_res[x_v], *(input + prev_res[x_v]) > *(input + res));
- }
- return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find index maximum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
- DATA_TYPE_OUTPUT res = 0;
- for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
- {
- res = select(res, x_v, *(input + x_v) > *(input + res));
- }
- return res;
-#else // WIDTH >= 16
- int x_elem = x_idx * 16;
- const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
- x_elem -= x_goback;
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input - x_goback);
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
- VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
- idx_sel = (in.s01234567 >= in.s89abcdef);
- in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
- res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
- idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
- in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
- res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
- idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
- in.s01 = select(in.s23, in.s01, idx_sel.s01);
- res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
- idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
- res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
- return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX)
-
-/** This kernel performs parallel reduction given an operation on x-axis.
- *
- * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed using -DPREV_OUTPUT
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
- * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the ArgMax
- * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the ArgMin
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] prev_res_ptr (Optional) Pointer to previous results tensor. Supported data types: U32/S32
- * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X dimension (in bytes)
- * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y dimension (in bytes)
- * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element in the previous results tensor
- * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: U32/S32
- * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_results Local buffer for storing the partial result
- */
-__kernel void arg_min_max_x(
- IMAGE_DECLARATION(src),
-#if defined(PREV_OUTPUT)
- IMAGE_DECLARATION(prev_res),
-#endif // defined(PREV_OUTPUT)
- IMAGE_DECLARATION(partial_res),
- __local DATA_TYPE_OUTPUT *local_results)
-{
-#if defined(PREV_OUTPUT)
- Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
- Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
-#else // !defined(PREV_OUTPUT)
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#endif // defined(PREV_OUTPUT)
- Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
-
- unsigned int lsize = get_local_size(0);
- unsigned int lid = get_local_id(0);
-
- const uint x_idx = get_global_id(0);
- const uint y_idx = get_global_id(1);
- const __global DATA_TYPE *src_in_row = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
-
- for(unsigned int y = 0; y < get_local_size(1); ++y)
- {
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else // !defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#else // defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else // !defined(PREV_OUTPUT)
- local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Looking for the next highest power of 2 (maximum value of lsize is 8)
- unsigned int middle = lsize - 1;
- middle |= middle >> 1;
- middle |= middle >> 2;
- middle += 1;
- // Perform parallel reduction
- for(unsigned int i = middle; i > 0; i >>= 1)
- {
- if(lid < i && lid + i < lsize)
- {
- DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
- DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
-#if defined(ARG_MAX)
- local_results[lid] = select(
- local_results[lid],
- local_results[lid + i],
- ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1));
-#else // defined(ARG_MIN)
- local_results[lid] = select(
- local_results[lid],
- local_results[lid + i],
- ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- if(lid == 0)
- {
- ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
- }
- }
-}
-#endif // defined(WIDTH)
-
-#if defined(HEIGHT)
-/** This kernel performs reduction on y-axis.
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
- * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: U32/S32
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
- */
-__kernel void arg_min_max_y(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(output))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image output = CONVERT_TO_IMAGE_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- indx = 0;
- for(unsigned int y = 1; y < HEIGHT; ++y)
- {
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
- indx = select(indx, y, cond_conv);
- res = select(res, in, CONDITION_TO_USE(in, res));
- }
-
- // Store result
- vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif // defined(HEIGHT)
-
-#if defined(DEPTH)
-/** This kernel performs reduction on z-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: U32/S32
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
- */
-__kernel void arg_min_max_z(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- indx = 0;
- for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
- {
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
- indx = select(indx, z, cond_conv);
- res = select(res, in, CONDITION_TO_USE(in, res));
- }
-
- // Store result
- vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif /* defined(DEPTH) */
-
-#if defined(BATCH) && defined(DEPTH)
-/** This kernel performs reduction on w-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: U32/S32
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
- */
-__kernel void arg_min_max_w(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- indx = 0;
- for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
- {
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
- VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
- cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
- indx = select(indx, w, cond_conv);
- res = select(res, in, CONDITION_TO_USE(in, res));
- }
-
- // Store result
- vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
-#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
\ No newline at end of file
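
Editor's note: the deleted x-axis kernel computes one candidate index per 16-element chunk and then tree-reduces the candidates in local memory, breaking ties toward the lower index so the result matches a left-to-right scan. A scalar C sketch of that combine step for the ARG_MAX case (illustrative only):

    /* Combine two candidate indices the way arg_min_max_x's reduction does:
     * the strictly larger value wins; on equal values the lower index wins. */
    static unsigned argmax_combine(const float *data, unsigned i0, unsigned i1)
    {
        if (data[i1] > data[i0])
            return i1;                      /* strictly greater value wins */
        if (data[i1] == data[i0] && i1 < i0)
            return i1;                      /* ties go to the lower index */
        return i0;
    }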
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
deleted file mode 100644
index 8a71985b02..0000000000
--- a/src/core/CL/cl_kernels/batch_to_space.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- VECTOR_DECLARATION(block_shape),
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
- const int block_x = *((__global int *)vector_offset(&block, 0));
- const int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
- * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- VECTOR_DECLARATION(block_shape),
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
- const int block_x = *((__global int *)vector_offset(&block, 0));
- const int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int block_x = BLOCK_SHAPE_X;
- const int block_y = BLOCK_SHAPE_Y;
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int block_x = BLOCK_SHAPE_X;
- const int block_y = BLOCK_SHAPE_Y;
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file
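The static variants differ from the dynamic ones only in where the block shape comes from: compile-time -D options instead of a runtime S32 vector, which lets the OpenCL compiler fold r, block_x and block_y into constants. A hedged host-side sketch of specializing such a kernel at build time (the program and device handles are assumed to exist, and error handling is elided):

#include <CL/cl.h>

/* Bake data type, batch size and block shape into the compiled program. */
static cl_int build_static_batch_to_space(cl_program program, cl_device_id device)
{
    const char *opts = "-DDATA_TYPE=float -DBATCH_SIZE=8 -DBLOCK_SHAPE_X=2 -DBLOCK_SHAPE_Y=2";
    return clBuildProgram(program, 1, &device, opts, NULL, NULL);
}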
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
deleted file mode 100644
index 89cbe4440e..0000000000
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define INVSQRT_OP(a) rsqrt((a))
-#define SQCVT_SAT(a) (a)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-
-/** Apply batch normalization.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
- * @param[in] epsilon Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
- VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
- float epsilon)
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D out = in;
-#else /* IN_PLACE */
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
-#ifndef USE_DEFAULT_BETA
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
-#endif /* USE_DEFAULT_GAMMA */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- denominator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- numerator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- x_bar = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = 0;
-
- const int current_slice = get_global_id(2);
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
- denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
- denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
- // Calculate x bar and store results
- numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
- numerator = SUB_OP(data, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
-
- res = MUL_OP(gamma_vec, x_bar);
-#else /* USE_DEFAULT_GAMMA */
- // gamma is equal to 1, no need to perform multiplications
- res = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
- // beta is not zero, hence we need to perform the addition
- res = ADD_OP(res, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
- res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)out.ptr);
-}
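For reference, the element-wise computation both batch normalization kernels perform is the standard inference-time formula. A scalar C sketch of the same math (the function name is illustrative; eps corresponds to the epsilon kernel argument):

#include <math.h>

/* out = gamma * (x - mean) / sqrt(var + eps) + beta, with per-channel
 * mean/var/gamma/beta. USE_DEFAULT_GAMMA fixes gamma to 1 and
 * USE_DEFAULT_BETA fixes beta to 0, so those kernels skip the multiply/add. */
static float batchnorm_ref(float x, float mean, float var, float gamma,
                           float beta, float eps)
{
    const float x_bar = (x - mean) / sqrtf(var + eps);
    return gamma * x_bar + beta;
}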
-
-/** Apply batch normalization on tensors with NHWC format.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
- * @param[in] epsilon Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
- VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
- float epsilon)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
- __global uchar *output_addr = input_addr;
-#else /* IN_PLACE */
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
- __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
- __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
-#ifndef USE_DEFAULT_BETA
- __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_GAMMA */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- denominator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- numerator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- x_bar = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res0 = 0;
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
- denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
- denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
- // Calculate x bar and store results
- numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
- numerator = SUB_OP(data, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
-
- res0 = MUL_OP(gamma_vec, x_bar);
-#else /* USE_DEFAULT_GAMMA */
- // gamma is equal to 1, no need to perform multiplications
- res0 = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
- // beta is not zero, hence we need to perform the addition
- res0 = ADD_OP(res0, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
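Unlike the NCHW variant, the NHWC kernel above tolerates a channel count that is not a multiple of VEC_SIZE: every work-item is shifted left by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements so its vector stays in bounds, the shift is clamped to 0 for work-item 0, and STORE_VECTOR_SELECT writes only VEC_SIZE_LEFTOVER lanes for that first, partial vector. A hedged scalar sketch of the offset computation:

/* g = global id, V = VEC_SIZE, L = VEC_SIZE_LEFTOVER, elem = sizeof(DATA_TYPE).
 * The clamp keeps work-item 0 (the partial store) at the start of the row;
 * all later work-items overlap it by (V - L) % V elements instead of
 * reading past the end. */
static int leftover_x_offset(int g, int V, int L, int elem)
{
    const int offs = g * V * elem - ((V - L) % V) * elem;
    return offs < 0 ? 0 : offs;
}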
-
-#if defined(DATA_TYPE) && defined(EPSILON)
-/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
- *
- * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
- * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
- * For depthwise convolution weights, do not pass DIM2
- * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
- * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
- *
- * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
- * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
- * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes)
- * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
- * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
- * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
- * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
- * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes)
- * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
- * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
- * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
- * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor
- */
-__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
-#if defined(BIAS)
- VECTOR_DECLARATION(b),
-#endif // defined(BIAS)
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var)
-#ifndef IN_PLACE_W
- ,
- TENSOR3D_DECLARATION(w_fused)
-#endif // ifndef IN_PLACE_W
-#ifndef IN_PLACE_B
- ,
- VECTOR_DECLARATION(b_fused)
-#endif // ifndef IN_PLACE_B
-#if defined(BETA)
- ,
- VECTOR_DECLARATION(beta)
-#endif // defined(BETA)
-#if defined(GAMMA)
- ,
- VECTOR_DECLARATION(gamma)
-#endif // defined(GAMMA)
- )
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
-#if defined(DIM2)
- int c0 = z % DIM2;
- int c1 = z / DIM2;
-#else // ! defined(DIM2)
- int c0 = 0;
-#if defined(NHWC)
- int c1 = x;
-#else // defined(NHWC)
- int c1 = z;
-#endif // defined(NHWC)
-#endif // defined(DIM2)
-
- int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
- int v_offset = c1 * sizeof(DATA_TYPE);
-
- DATA_TYPE w_old = 0.0f;
- DATA_TYPE b_old = 0.0f;
- DATA_TYPE w_new = 0.0f;
- DATA_TYPE b_new = 0.0f;
- DATA_TYPE gamma = 1.0f;
- DATA_TYPE mean = 0.0f;
- DATA_TYPE var = 1.0f;
- DATA_TYPE beta = 0.0f;
-
- w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
- var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
- mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
-
-#if defined(GAMMA)
- gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
-#endif // defined(GAMMA)
-
- // Compute new weight
- w_new = (gamma * w_old) / (sqrt(var + EPSILON));
-
-#if defined(IN_PLACE_W)
- *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
-#else // defined(IN_PLACE_W)
- *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
-#endif // defined(IN_PLACE_W)
-
- // Compute bias
-#if !defined(DIM2) && defined(NHWC)
- if(z == 0 && y == 0)
-#else // !defined(DIM2) && defined(NHWC)
- if(x == 0 && y == 0 && c0 == 0)
-#endif // !defined(DIM2) && defined(NHWC)
- {
-#if defined(BIAS)
- b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
-#endif // defined(BIAS)
-#if defined(BETA)
- beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
-#endif // defined(BETA)
-
- b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
-
-#if defined(BIAS)
-
-#if defined(IN_PLACE_B)
- *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
-#else // defined(IN_PLACE_B)
- *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // defined(IN_PLACE_B)
-
-#else // defined(BIAS)
-
-#ifndef IN_PLACE_B
- *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // ifndef IN_PLACE_B
-
-#endif // defined(BIAS)
- }
-}
-#endif // defined(DATA_TYPE) && defined(EPSILON)
\ No newline at end of file
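The fusion above rescales each output-feature-map slice of the weights and rewrites the bias so that the convolution alone reproduces convolution followed by batch normalization. A scalar C sketch of the per-OFM update (illustrative name; w points at the weights of one OFM):

#include <math.h>

/* w' = gamma * w / sqrt(var + eps)
 * b' = gamma * (b - mean) / sqrt(var + eps) + beta
 * so that conv(x, w') + b' == batchnorm(conv(x, w) + b). */
static void fuse_batchnorm_ref(float *w, int num_weights, float *b,
                               float mean, float var, float gamma,
                               float beta, float eps)
{
    const float scale = gamma / sqrtf(var + eps);
    for(int i = 0; i < num_weights; ++i)
    {
        w[i] *= scale;
    }
    *b = (*b - mean) * scale + beta;
}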
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
deleted file mode 100644
index bcff8438db..0000000000
--- a/src/core/CL/cl_kernels/canny.cl
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculate the magnitude and phase from the horizontal and vertical results of the Sobel operator.
- *
- * @note The calculation of gradient uses level 1 normalisation.
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
- * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
- * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
- * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
- * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
- */
-__kernel void combine_gradients_L1(
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(src2),
- IMAGE_DECLARATION(grad),
- IMAGE_DECLARATION(angle))
-{
- // Construct images
- Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
- Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
- Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
- Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
-
- // Load sobel horizontal and vertical values
- VEC_DATA_TYPE(DATA_TYPE_IN, 4)
- h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
- VEC_DATA_TYPE(DATA_TYPE_IN, 4)
- v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);
-
- /* Calculate the gradient, using level 1 normalisation method */
- VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
- m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
-
- /* Calculate the angle */
- float4 p = 180.0f * atan2pi(convert_float4(v), convert_float4(h));
-
- /* Remap angle to range [0, 180) */
- p = select(p, p + 180.0f, p < 0.0f);
-
- /* Store results */
- vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
- vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
-}
-
-/** Calculate the gradient and angle from the horizontal and vertical results of the Sobel operator.
- *
- * @note The calculation of gradient uses level 2 normalisation
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
- * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
- * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
- * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
- * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
- * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
- */
-__kernel void combine_gradients_L2(
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(src2),
- IMAGE_DECLARATION(grad),
- IMAGE_DECLARATION(angle))
-{
- // Construct images
- Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
- Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
- Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
- Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
-
- // Load sobel horizontal and vertical values
- float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr));
- float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr));
-
- /* Calculate the gradient, using level 2 normalisation method */
- float4 m = sqrt(h * h + v * v);
-
- /* Calculate the angle */
- float4 p = 180.0f * atan2pi(v, h);
-
- /* Remap angle to range [0, 180) */
- p = select(p, p + 180.0f, p < 0.0f);
-
- /* Store results */
- vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr);
- vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
-}
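Both combine kernels produce the same magnitude/angle pair and differ only in the norm. A scalar C sketch of the two normalisation options and the fold of the angle into [0, 180), with an illustrative name:

#include <math.h>
#include <stdlib.h>

/* L1 norm: |h| + |v|; L2 norm: sqrt(h*h + v*v). The angle is computed in
 * degrees and negative values are folded up by 180, as in the kernels. */
static void combine_gradients_ref(int h, int v, int use_l2,
                                  float *magnitude, float *angle)
{
    const float deg_per_rad = 180.0f / 3.14159265358979f;
    *magnitude = use_l2 ? sqrtf((float)(h * h + v * v))
                        : (float)(abs(h) + abs(v));
    const float p = atan2f((float)v, (float)h) * deg_per_rad;
    *angle      = (p < 0.0f) ? p + 180.0f : p;
}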
-
-#define EDGE 255
-#define NO_EDGE 0
-
-/** Array that holds the relative coordinates offset for the neighbouring pixels.
- */
-__constant short4 neighbours_coords[] =
-{
- { -1, 0, 1, 0 }, // 0
- { -1, -1, 1, 1 }, // 45
- { 0, -1, 0, 1 }, // 90
- { 1, -1, -1, 1 }, // 135
-};
-
-/** Perform non maximum suppression.
- *
- * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- *
- * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32
- * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] grad_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8
- * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] angle_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] non_max_ptr Pointer to the non maximum suppressed output. Supported data types: U16, U32
- * @param[in] non_max_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] non_max_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the output
- * @param[in] lower_thr The low threshold
- */
-__kernel void suppress_non_maximum(
- IMAGE_DECLARATION(grad),
- IMAGE_DECLARATION(angle),
- IMAGE_DECLARATION(non_max),
- uint lower_thr)
-{
- // Construct images
- Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
- Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
- Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
-
- // Index
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-
- // Get gradient and angle
- DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
- uchar an = *((__global uchar *)angle.ptr);
-
- // Early return if not greater than lower threshold
- if(gradient <= lower_thr)
- {
- return;
- }
-
- // Divide the whole round into 4 directions
- DATA_TYPE_OUT q_an;
-
- if(an < 22.5f || an >= 157.5f)
- {
- q_an = 0;
- }
- else if(an < 67.5f)
- {
- q_an = 1;
- }
- else if(an < 112.5f)
- {
- q_an = 2;
- }
- else
- {
- q_an = 3;
- }
-
- // Find the two pixels in the perpendicular direction
- short2 x_p = neighbours_coords[q_an].s02;
- short2 y_p = neighbours_coords[q_an].s13;
- DATA_TYPE_IN g1 = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
- DATA_TYPE_IN g2 = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
-
- if((gradient > g1) && (gradient > g2))
- {
- __global uchar *non_max_addr = non_max_ptr + non_max_offset_first_element_in_bytes + x * non_max_stride_x + y * non_max_stride_y;
- *((global DATA_TYPE_OUT *)non_max_addr) = gradient;
- }
-}
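The suppression step quantizes the angle into four directions and keeps a pixel only if its gradient strictly dominates both neighbours along that direction. A hedged scalar sketch of the same decision, with the 22.5-degree thresholds rounded to integers and border pixels assumed in range:

/* Relative neighbour offsets (x1, y1, x2, y2) per quantized direction,
 * matching neighbours_coords above. */
static const short nms_coords_ref[4][4] = {
    { -1, 0, 1, 0 },  /* 0 degrees   */
    { -1, -1, 1, 1 }, /* 45 degrees  */
    { 0, -1, 0, 1 },  /* 90 degrees  */
    { 1, -1, -1, 1 }, /* 135 degrees */
};

/* Returns the gradient at (x, y) if it is a local maximum above lower_thr,
 * otherwise 0. grad is a row-major w-wide image; angle is in [0, 180). */
static int suppress_non_maximum_ref(const int *grad, int w, int x, int y,
                                    unsigned char angle, int lower_thr)
{
    const int g = grad[y * w + x];
    if(g <= lower_thr)
    {
        return 0;
    }
    int q = 3;
    if(angle < 23 || angle >= 158)
    {
        q = 0;
    }
    else if(angle < 68)
    {
        q = 1;
    }
    else if(angle < 113)
    {
        q = 2;
    }
    const int g1 = grad[(y + nms_coords_ref[q][1]) * w + (x + nms_coords_ref[q][0])];
    const int g2 = grad[(y + nms_coords_ref[q][3]) * w + (x + nms_coords_ref[q][2])];
    return (g > g1 && g > g2) ? g : 0;
}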
-
-#define hysteresis_local_stack_L1 8 // The size of the level 1 stack. This has to agree with the host side
-#define hysteresis_local_stack_L2 16 // The size of the level 2 stack; adjusting this can impact the match rate with the VX implementation
-
-/** Check whether pixel is valid
- *
- * Skip the pixel if early_test is true.
- * Otherwise, it tries to add the pixel coordinate to the stack, and proceeds to pop the stack instead if the stack is full
- *
- * @param[in] early_test Boolean condition based on the minv check and visited buffer check
- * @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary
- * @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary
- * @param[in] x_cur X-coordinate of current central pixel
- * @param[in] y_cur Y-coordinate of current central pixel
- */
-#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur) \
- { \
- if(!early_test) \
- { \
- /* Number of elements in the local stack 1, points to next available entry */ \
- c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)); \
- \
- if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */ \
- goto pop_stack; \
- \
- /* The pixel that has already been recorded is ignored */ \
- if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1)) \
- { \
- l1_ptr[c] = (short2)(x_pos, y_pos); \
- *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1; \
- } \
- } \
- }
-
-/** Perform hysteresis.
- *
- * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
- *
- * @param[in] src_ptr Pointer to the input image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the output image. Supported data types: U8
- * @param[in] out_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output
- * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
- * @param[in] visited_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] visited_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the output
- * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
- * @param[in] recorded_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] recorded_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the output
- * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. Supported data types: S32
- * @param[in] l1_stack_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] l1_stack_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the output
- * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8
- * @param[in] l1_stack_counter_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] l1_stack_counter_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
- * @param[in] low_thr The lower threshold
- * @param[in] up_thr The upper threshold
- * @param[in] width The width of the image.
- * @param[in] height The height of the image
- */
-kernel void hysteresis(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(visited),
- IMAGE_DECLARATION(recorded),
- IMAGE_DECLARATION(l1_stack),
- IMAGE_DECLARATION(l1_stack_counter),
- uint low_thr,
- uint up_thr,
- int width,
- int height)
-{
- // Create images
- Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
- Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
- Image visited = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
- Image recorded = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
- Image l1_stack = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
- Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
-
- // Index
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- // Load value
- DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
-
- // If the pixel has already been marked as NO_EDGE, store that value in the output and return
- if(val == NO_EDGE)
- {
- *offset(&out, x, y) = NO_EDGE;
- return;
- }
-
- // Return if it is a MAYBE pixel. Such pixels will become edges if near a strong edge
- if(val <= up_thr)
- {
- return;
- }
-
- // Init local stack 2
- short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
- int L2_counter = 0;
-
- // Perform recursive hysteresis
- while(true)
- {
- // Get L1 stack pointer
- __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
-
- // If the pixel has already been visited, proceed with the items in the stack instead
- if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
- {
- goto pop_stack;
- }
-
- // Set strong edge
- *offset(&out, x, y) = EDGE;
-
- // If the level 2 stack is full, skip checking the surrounding pixels and pop it instead
- if(L2_counter > (hysteresis_local_stack_L2 - 1))
- {
- goto pop_stack2;
- }
-
- // Number of elements in the local L1 stack; points to the next available entry
- char c;
-
- VEC_DATA_TYPE(DATA_TYPE_IN, 4)
- x_tmp;
- uint4 v_tmp;
-
- // Get direction pixel indices
- int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
-
- // Check 8 pixels around for weak edges where low_thr < val <= up_thr
- x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
- v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
- check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
- check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
- check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
-
- x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
- v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
- check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
- check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
-
- x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
- v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
- check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
- check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
- check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
-
-#undef check_pixel
-
-pop_stack:
- c = *((__global char *)offset(&l1_stack_counter, x, y));
-
- if(c >= 1)
- {
- *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
- int2 l_c = convert_int2(l1_ptr[c - 1]);
-
- // Push the current position into level 2 stack
- stack_L2[L2_counter].x = x;
- stack_L2[L2_counter].y = y;
-
- x = l_c.x;
- y = l_c.y;
-
- L2_counter++;
-
- continue;
- }
-
- if(L2_counter > 0)
- {
- goto pop_stack2;
- }
- else
- {
- return;
- }
-
-pop_stack2:
- L2_counter--;
- x = stack_L2[L2_counter].x;
- y = stack_L2[L2_counter].y;
- };
-}
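The two-level stack machinery above is an iterative flood fill: pixels above up_thr become strong edges and recursively promote 8-connected neighbours whose value exceeds low_thr. A simple recursive C sketch of the same semantics, ignoring the GPU-specific visited/recorded bookkeeping and stack-depth limits (all names are illustrative):

/* out is a row-major w x h U8 image pre-filled with NO_EDGE (0). */
static void hysteresis_spread_ref(const int *src, unsigned char *out,
                                  int w, int h, int x, int y, int low_thr)
{
    if(x < 0 || x >= w || y < 0 || y >= h || out[y * w + x] == 255)
    {
        return; /* out of bounds or already an edge */
    }
    out[y * w + x] = 255; /* EDGE */
    for(int dy = -1; dy <= 1; ++dy)
    {
        for(int dx = -1; dx <= 1; ++dx)
        {
            const int nx = x + dx, ny = y + dy;
            if(nx >= 0 && nx < w && ny >= 0 && ny < h && src[ny * w + nx] > low_thr && out[ny * w + nx] != 255)
            {
                hysteresis_spread_ref(src, out, w, h, nx, ny, low_thr);
            }
        }
    }
}

/* Seed the fill from every strong pixel. */
static void hysteresis_ref(const int *src, unsigned char *out, int w, int h,
                           int low_thr, int up_thr)
{
    for(int y = 0; y < h; ++y)
        for(int x = 0; x < w; ++x)
            if(src[y * w + x] > up_thr)
                hysteresis_spread_ref(src, out, w, h, x, y, low_thr);
}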
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
deleted file mode 100644
index 550d52e9ea..0000000000
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function combines three planes to a single RGB image.
- *
- * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: RGB
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_combine_RGB888(
- IMAGE_DECLARATION(plane0),
- IMAGE_DECLARATION(plane1),
- IMAGE_DECLARATION(plane2),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
- Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
- Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data0 = vload16(0, plane0.ptr);
- uchar16 data1 = vload16(0, plane1.ptr);
- uchar16 data2 = vload16(0, plane2.ptr);
-
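- // Interleave into packed RGB: 16 pixels x 3 bytes (R, G, B) = 48 output bytes, written as three 16-byte stores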
- uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
- data0.s1, data1.s1, data2.s1,
- data0.s2, data1.s2, data2.s2,
- data0.s3, data1.s3, data2.s3,
- data0.s4, data1.s4, data2.s4,
- data0.s5);
- vstore16(out0, 0, dst.ptr);
-
- uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
- data1.s6, data2.s6, data0.s7,
- data1.s7, data2.s7, data0.s8,
- data1.s8, data2.s8, data0.s9,
- data1.s9, data2.s9, data0.sA,
- data1.sA);
- vstore16(out1, 0, dst.ptr + 16);
-
- uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
- data2.sB, data0.sC, data1.sC,
- data2.sC, data0.sD, data1.sD,
- data2.sD, data0.sE, data1.sE,
- data2.sE, data0.sF, data1.sF,
- data2.sF);
- vstore16(out2, 0, dst.ptr + 32);
-}
-
- /** This function combines four planes into a single RGBA image.
- *
- * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[in] plane3_ptr Pointer to the fourth plane. Supported Format: U8
- * @param[in] plane3_stride_x Stride of the fourth plane in X dimension (in bytes)
- * @param[in] plane3_step_x plane3_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane3_stride_y Stride of the fourth plane in Y dimension (in bytes)
- * @param[in] plane3_step_y plane3_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: RGBA
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_combine_RGBA8888(
- IMAGE_DECLARATION(plane0),
- IMAGE_DECLARATION(plane1),
- IMAGE_DECLARATION(plane2),
- IMAGE_DECLARATION(plane3),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
- Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
- Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
- Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data0 = vload16(0, plane0.ptr);
- uchar16 data1 = vload16(0, plane1.ptr);
- uchar16 data2 = vload16(0, plane2.ptr);
- uchar16 data3 = vload16(0, plane3.ptr);
-
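- // Interleave into packed RGBA: 16 pixels x 4 bytes = 64 output bytes, written as four 16-byte stores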
- uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0,
- data0.s1, data1.s1, data2.s1, data3.s1,
- data0.s2, data1.s2, data2.s2, data3.s2,
- data0.s3, data1.s3, data2.s3, data3.s3);
- vstore16(out0, 0, dst.ptr);
-
- uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4,
- data0.s5, data1.s5, data2.s5, data3.s5,
- data0.s6, data1.s6, data2.s6, data3.s6,
- data0.s7, data1.s7, data2.s7, data3.s7);
- vstore16(out1, 0, dst.ptr + 16);
-
- uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8,
- data0.s9, data1.s9, data2.s9, data3.s9,
- data0.sA, data1.sA, data2.sA, data3.sA,
- data0.sB, data1.sB, data2.sB, data3.sB);
- vstore16(out2, 0, dst.ptr + 32);
-
- uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC,
- data0.sD, data1.sD, data2.sD, data3.sD,
- data0.sE, data1.sE, data2.sE, data3.sE,
- data0.sF, data1.sF, data2.sF, data3.sF);
- vstore16(out3, 0, dst.ptr + 48);
-}
-
- /** This function combines three planes into a single YUYV image.
- *
- * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: YUYV
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_combine_YUYV422(
- IMAGE_DECLARATION(plane0),
- IMAGE_DECLARATION(plane1),
- IMAGE_DECLARATION(plane2),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
- Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
- Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data0 = vload16(0, plane0.ptr);
- uchar8 data1 = vload8(0, plane1.ptr);
- uchar8 data2 = vload8(0, plane2.ptr);
-
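- // Horizontal 4:2:2 subsampling: each U/V sample is shared by two luma samples, packed as Y U Y V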
- uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0,
- data0.s2, data1.s1, data0.s3, data2.s1,
- data0.s4, data1.s2, data0.s5, data2.s2,
- data0.s6, data1.s3, data0.s7, data2.s3);
- vstore16(out0, 0, dst.ptr);
- uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4,
- data0.sA, data1.s5, data0.sB, data2.s5,
- data0.sC, data1.s6, data0.sD, data2.s6,
- data0.sE, data1.s7, data0.sF, data2.s7);
- vstore16(out1, 0, dst.ptr + 16);
-}
-
- /** This function combines three planes into a single UYVY image.
- *
- * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
- * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
- * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
- * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
- * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
- * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
- * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
- * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
- * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
- * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: UYVY
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_combine_UYVY422(
- IMAGE_DECLARATION(plane0),
- IMAGE_DECLARATION(plane1),
- IMAGE_DECLARATION(plane2),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
- Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
- Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data0 = vload16(0, plane0.ptr);
- uchar8 data1 = vload8(0, plane1.ptr);
- uchar8 data2 = vload8(0, plane2.ptr);
-
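- // Horizontal 4:2:2 subsampling: each U/V sample is shared by two luma samples, packed as U Y V Y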
- uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1,
- data1.s1, data0.s2, data2.s1, data0.s3,
- data1.s2, data0.s4, data2.s2, data0.s5,
- data1.s3, data0.s6, data2.s3, data0.s7);
- vstore16(out0, 0, dst.ptr);
- uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9,
- data1.s5, data0.sA, data2.s5, data0.sB,
- data1.s6, data0.sC, data2.s6, data0.sD,
- data1.s7, data0.sE, data2.s7, data0.sF);
- vstore16(out1, 0, dst.ptr + 16);
-}
-
- /** This function combines three planes into a single NV12/NV21 image.
- *
- * @note NV12 or NV21 has to be specified through a preprocessor macro, e.g. -DNV12 performs NV12 channel combine.
- *
- * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
- * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
- * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
- * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
- * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
- * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
- * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
- * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
- * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
- * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[out] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
- * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
- * @param[out] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88
- * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
- * @param[in] height Sub-sampled height
- */
-__kernel void channel_combine_NV(
- IMAGE_DECLARATION(src_plane0),
- IMAGE_DECLARATION(src_plane1),
- IMAGE_DECLARATION(src_plane2),
- IMAGE_DECLARATION(dst_plane0),
- IMAGE_DECLARATION(dst_plane1),
- uint height)
-{
- // Get pixels pointer
- Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
- Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
- Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
- Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
- Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
-
- // Copy plane data
- vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
- vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
-
- // Create UV plane
- uchar8 data1 = vload8(0, src_plane1.ptr);
- uchar8 data2 = vload8(0, src_plane2.ptr);
-
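- // The shuffle2 mask (0, 8, 1, 9, ...) interleaves the two 8-byte chroma rows: NV12 stores U first, NV21 stores V first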
-#ifdef NV12
- vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
-#elif defined(NV21)
- vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
-#endif /* NV12 or NV21 */
-}
-
- /** This function combines three planes into a single YUV444 or IYUV image.
- *
- * @note YUV444 or IYUV has to be specified through a preprocessor macro, e.g. -DIYUV performs IYUV channel combine.
- *
- * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
- * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
- * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
- * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
- * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
- * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
- * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
- * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
- * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
- * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
- * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
- * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
- * @param[out] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
- * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
- * @param[out] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8
- * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
- * @param[out] dst_plane2_ptr Pointer to the third plane of the destination image. Supported Format: U8
- * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes)
- * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes)
- * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
- * @param[in] height Sub-sampled height
- */
-__kernel void copy_planes_3p(
- IMAGE_DECLARATION(src_plane0),
- IMAGE_DECLARATION(src_plane1),
- IMAGE_DECLARATION(src_plane2),
- IMAGE_DECLARATION(dst_plane0),
- IMAGE_DECLARATION(dst_plane1),
- IMAGE_DECLARATION(dst_plane2),
- uint height)
-{
- // Get pixels pointer
- Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
- Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
- Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
- Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
- Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
- Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
-
- // Copy plane data
- vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-#ifdef YUV444
- vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
- vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#elif defined(IYUV)
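- // IYUV chroma is subsampled by 2 in x and y: each work item copies a second luma row and half-width (8-byte) chroma rows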
- vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
- vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
- vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#endif /* YUV444 or IYUV */
-}
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
deleted file mode 100644
index b64f24814e..0000000000
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function extracts a given channel from an RGB image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: RGB
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_RGB888(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data = vload16(0, src.ptr);
- uchar8 data2 = vload8(0, src.ptr + 16);
-
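- // Packed RGB stores channel c of pixel i at byte 3 * i + c, so each swizzle below gathers every third byte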
-#ifdef CHANNEL_R
- vstore4(data.s0369, 0, dst.ptr);
- vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
-#elif defined(CHANNEL_G)
- vstore4(data.s147A, 0, dst.ptr);
- vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
-#elif defined(CHANNEL_B)
- vstore4(data.s258B, 0, dst.ptr);
- vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
-#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */
-}
-
-/** This function extracts a given channel from an RGBA image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_RGBA8888(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data = vload16(0, src.ptr);
- uchar16 data2 = vload16(0, src.ptr + 16);
-
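- // Packed RGBA stores channel c of pixel i at byte 4 * i + c, so each swizzle below selects one byte per pixel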
-#ifdef CHANNEL_R
- vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr);
-#elif defined(CHANNEL_G)
- vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr);
-#elif defined(CHANNEL_B)
- vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr);
-#elif defined(CHANNEL_A)
- vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr);
-#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */
-}
-
- /** This function extracts a given channel from a YUYV image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_YUYV422(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data = vload16(0, src.ptr);
-
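- // YUYV byte order is Y0 U0 Y1 V0: luma at even offsets, U at offset 1 mod 4, V at offset 3 mod 4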
-#ifdef CHANNEL_Y
- vstore8(data.s02468ACE, 0, dst.ptr);
-#elif defined(CHANNEL_U)
- vstore4(data.s159D, 0, dst.ptr);
-#elif defined(CHANNEL_V)
- vstore4(data.s37BF, 0, dst.ptr);
-#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
-}
-
- /** This function extracts a given channel from a UYVY image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: UYVY
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_UYVY422(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data = vload16(0, src.ptr);
-
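- // UYVY byte order is U0 Y0 V0 Y1: luma at odd offsets, U at offset 0 mod 4, V at offset 2 mod 4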
-#ifdef CHANNEL_Y
- vstore8(data.s13579BDF, 0, dst.ptr);
-#elif defined(CHANNEL_U)
- vstore4(data.s048C, 0, dst.ptr);
-#elif defined(CHANNEL_V)
- vstore4(data.s26AE, 0, dst.ptr);
-#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
-}
-
-/** This function extracts a given channel from an NV12 image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- * @warning Only the U and V channels can be extracted by this kernel.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: NV12 (UV88)
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_NV12(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data = vload16(0, src.ptr);
-
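- // The NV12 chroma plane interleaves U and V: U at even bytes, V at odd bytes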
-#ifdef CHANNEL_U
- vstore8(data.s02468ACE, 0, dst.ptr);
-#elif defined(CHANNEL_V)
- vstore8(data.s13579BDF, 0, dst.ptr);
-#endif /* CHANNEL_U or CHANNEL_V */
-}
-
-/** This function extracts a given channel from an NV21 image.
- *
- * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
- * @warning Only the U and V channels can be extracted by this kernel.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: NV21 (UV88)
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void channel_extract_NV21(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 data = vload16(0, src.ptr);
-
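- // The NV21 chroma plane interleaves V and U: V at even bytes, U at odd bytes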
-#ifdef CHANNEL_U
- vstore8(data.s13579BDF, 0, dst.ptr);
-#elif defined(CHANNEL_V)
- vstore8(data.s02468ACE, 0, dst.ptr);
-#endif /* CHANNEL_U or CHANNEL_V */
-}
-
- /** This function extracts a given plane from a multi-planar image.
- *
- * @param[in] src_ptr Pointer to the source image. Supported Format: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void copy_plane(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Copy plane data
- vstore8(vload8(0, src.ptr), 0, dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
deleted file mode 100644
index 9a87eb4af3..0000000000
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
-
-// Check valid VEC_SIZES
-#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-#error "Only vector sizes 4, 8 and 16 are supported"
-#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
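- // Computes div_res = x / y and mod_res = x % y using a float reciprocal; exact for the small index ranges used in this kernel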
-#define DIV_MOD_UINT(x, y, div_res, mod_res) \
- ({ \
- div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
- uint r = div_res * (y); \
- mod_res = (x)-r; \
- })
-
-/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- * K is equal to num_channels / num_groups.
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- uint curr_channel = 0; // channel id of input
- uint batch_id = 0; // batch id
- uint group_id = 0; // group id
- uint channel_id = 0; // channel id within the group
-
- // Compute curr_channel and batch_id
- DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
-
- // Compute group_id and channel_id
- DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
-
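- // Writing input channel (group_id, channel_id) to output channel channel_id * NUM_GROUPS + group_id transposes the group/channel indices, which performs the channel shuffle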
- const uint x = get_global_id(0) * VEC_SIZE;
- const uint y = get_global_id(1) * 2;
- const uint z = channel_id * NUM_GROUPS + group_id;
-
- // Load the Nx2 block
- const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
- TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-
- // Store blocks
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
- VSTORE(VEC_SIZE)
- (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
- VSTORE(VEC_SIZE)
- (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
-}
-
-#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
-/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note This implementation is only defined for VEC_SIZE = 4
- * @note The last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num, e.g. -DLAST_ACCESSED=64, in order to prevent out-of-bound writes.
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- * K is equal to num_channels / num_groups.
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
- uint channel_id0 = 0;
- uint channel_id1 = 0;
- uint channel_id2 = 0;
- uint channel_id3 = 0;
- uint group_id0 = 0;
- uint group_id1 = 0;
- uint group_id2 = 0;
- uint group_id3 = 0;
- uint y = 0;
- uint batch_id = 0;
-
- // Compute batch_id and y
- DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
-
- // Compute group_id and channel_id
- DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
- DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
- DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
- DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
-
- const uint x = get_global_id(1) * 2;
- const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
- const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
- const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
- const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
-
- // Load the Nx2 block
- const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
- TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-
- // Store blocks
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
- *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
- *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
- *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
- *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
- *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
- *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
- *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
- *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
-}
-#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
deleted file mode 100644
index cbebc88668..0000000000
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ /dev/null
@@ -1,1911 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
- /** Convert an RGB888 image to RGBA8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: RGB888
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: RGBA8888
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGB888_to_RGBA8888_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- // Handle 16 pixels per work item
- uchar16 rgb_0 = vload16(0, in.ptr);
- uchar16 rgb_1 = vload16(0, in.ptr + 16);
- uchar16 rgb_2 = vload16(0, in.ptr + 32);
-
- uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
- uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
- uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
- uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
-
- vstore16(rgba_0, 0, out.ptr);
- vstore16(rgba_1, 0, out.ptr + 16);
- vstore16(rgba_2, 0, out.ptr + 32);
- vstore16(rgba_3, 0, out.ptr + 48);
-}
-
-/** Convert an RGB888 image to U8
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: RGB888
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGB888_to_U8_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- // Handle 16 pixels per work item
- const uchar16 rgb_0 = vload16(0, in.ptr);
- const uchar16 rgb_1 = vload16(0, in.ptr + 16);
- const uchar16 rgb_2 = vload16(0, in.ptr + 32);
-
- // Resequence values from a sequence of 16 RGB values to a sequence of 16 R, 16 G, 16 B values
- const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd);
- const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se);
- const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf);
-
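- // BT.709 luma weights: Y = 0.2126 * R + 0.7152 * G + 0.0722 * B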
- const float16 rgb2u8_red_coef_bt709 = 0.2126f;
- const float16 rgb2u8_green_coef_bt709 = 0.7152f;
- const float16 rgb2u8_blue_coef_bt709 = 0.0722f;
-
- // Computation of 16 greyscale values in float
- const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b);
-
- // Convert it to 16 greyscale uchar values
- const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0);
-
- vstore16(greyscale_u8_0, 0, out.ptr);
-}
-
- /** Convert an RGBA8888 image to RGB888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: RGBA8888
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: RGB888
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void RGBA8888_to_RGB888_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
- // Handle 16 pixels per work item
- uchar16 rgba_0 = vload16(0, in.ptr);
- uchar16 rgba_1 = vload16(0, in.ptr + 16);
- uchar16 rgba_2 = vload16(0, in.ptr + 32);
- uchar16 rgba_3 = vload16(0, in.ptr + 48);
-
- uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124);
- uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);
- uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde);
-
- vstore16(rgb_0, 0, out.ptr);
- vstore16(rgb_1, 0, out.ptr + 16);
- vstore16(rgb_2, 0, out.ptr + 32);
-}
-
-/** Convert a UYVY422 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: UYVY422
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: RGB888
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void UYVY422_to_RGB888_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- // Handle 8 pixels per work item
- uchar16 uyvy = vload16(0, in.ptr);
-
- uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
- char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
- char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
-
- float8 red_coef_bt709 = (float8)(1.5748f);
- float8 green_coef_bt709 = (float8)(-0.1873f);
- float8 green_coef2_bt709 = (float8)(-0.4681f);
- float8 blue_coef_bt709 = (float8)(1.8556f);
- float8 lumav = convert_float8(luma);
-
- float8 f_r = red_coef_bt709 * convert_float8(cr);
- float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
- float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
- f_r += lumav;
- f_g += lumav;
- f_b += lumav;
-
- uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
- uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
- uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
- uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
- r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
- uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
-
- vstore16(rgb_0, 0, out.ptr);
- vstore8(rgb_1, 0, out.ptr + 16);
-}
-
-/** Convert a UYVY422 image to RGBA8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void UYVY422_to_RGBA8888_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- // handle 8 pixels every time
- uchar16 uyvy = vload16(0, in.ptr);
-
- uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
- char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
- char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
-
- float8 red_coef_bt709 = (float8)(1.5748f);
- float8 green_coef_bt709 = (float8)(-0.1873f);
- float8 green_coef2_bt709 = (float8)(-0.4681f);
- float8 blue_coef_bt709 = (float8)(1.8556f);
- float8 lumav = convert_float8(luma);
-
- float8 f_r = red_coef_bt709 * convert_float8(cr);
- float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
- float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
- f_r += lumav;
- f_g += lumav;
- f_b += lumav;
-
- uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
- uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
- uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
- uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
- r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
- r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
-
- vstore16(rgba_0, 0, out.ptr);
- vstore16(rgba_1, 0, out.ptr + 16);
-}
-
-/** Convert a YUYV422 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void YUYV422_to_RGB888_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- // handle 8 pixels every time
- uchar16 uyvy = vload16(0, in.ptr);
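-    // YUYV byte layout: Y0 U0 Y1 V0 | Y2 U1 Y3 V1 | ... (the variable keeps the sibling UYVY kernel's name)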
-
- uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
- char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
- char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
-
- float8 red_coef_bt709 = (float8)(1.5748f);
- float8 green_coef_bt709 = (float8)(-0.1873f);
- float8 green_coef2_bt709 = (float8)(-0.4681f);
- float8 blue_coef_bt709 = (float8)(1.8556f);
- float8 lumav = convert_float8(luma);
-
- float8 f_r = red_coef_bt709 * convert_float8(cr);
- float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
- float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
- f_r += lumav;
- f_g += lumav;
- f_b += lumav;
-
- uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
- uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
- uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
- uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
- r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
- uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
-
- vstore16(rgb_0, 0, out.ptr);
- vstore8(rgb_1, 0, out.ptr + 16);
-}
-
-/** Convert a YUYV422 image to RGBA8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void YUYV422_to_RGBA8888_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- // handle 8 pixels every time
- uchar16 uyvy = vload16(0, in.ptr);
-
- uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
- char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
- char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
-
- float8 red_coef_bt709 = (float8)(1.5748f);
- float8 green_coef_bt709 = (float8)(-0.1873f);
- float8 green_coef2_bt709 = (float8)(-0.4681f);
- float8 blue_coef_bt709 = (float8)(1.8556f);
- float8 lumav = convert_float8(luma);
-
- float8 f_r = red_coef_bt709 * convert_float8(cr);
- float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr);
- float8 f_b = blue_coef_bt709 * convert_float8(cb);
-
- f_r += lumav;
- f_g += lumav;
- f_b += lumav;
-
- uchar8 r_0 = convert_uchar8_sat_rtz(f_r);
- uchar8 g_0 = convert_uchar8_sat_rtz(f_g);
- uchar8 b_0 = convert_uchar8_sat_rtz(f_b);
-
- uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
- r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
- r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
-
- vstore16(rgba_0, 0, out.ptr);
- vstore16(rgba_1, 0, out.ptr + 16);
-}
-
-/** Convert an RGB image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
- * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
- * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
- * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_stride_y                              Stride of the destination image uv channel in Y dimension (in bytes)
- * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
- *
- */
-__kernel void RGB888_to_NV12_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(luma),
- IMAGE_DECLARATION(uv))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
- Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
-
- // handle 4 pixels every time, two lines, each line for 2 pixels
-    // Read 2 pixels of the first line
- uchar8 rgb_0 = vload8(0, in.ptr);
- uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
- uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
- uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
-
- float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
- float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
- float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
- short2 i_y = convert_short2_rtz(f_y);
- short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
- short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_0, 0, out_y.ptr);
-
- uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line
- uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
- uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
- uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
- uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
-
- f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
- f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
- f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
- i_y = convert_short2_rtz(f_y);
- i_u = convert_short2_rtz(f_u) + (short2)(128);
- i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
-
- uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
- uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
- ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
-
- vstore2(cbcr, 0, out_uv.ptr);
-}
-
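-/* Minimal scalar sketch (illustration only, not used by the kernels) of the
- * full-range BT.709 RGB -> YCbCr mapping vectorised above, including the 2x2
- * chroma average stored per block by the NV12/IYUV kernels:
- *
- *     Y  =  0.2126*R + 0.7152*G + 0.0722*B
- *     Cb = -0.1146*R - 0.3854*G + 0.5000*B + 128
- *     Cr =  0.5000*R - 0.4542*G - 0.0458*B + 128
- *
- * e.g. white (255,255,255) gives Y = 255, Cb = Cr = 128; the four Cb (resp.
- * Cr) values of each 2x2 pixel block are then averaged into the single
- * chroma sample emitted per block.
- */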
-/*
- R'= Y' + 0.0000*U + 1.5748*V
- G'= Y' - 0.1873*U - 0.4681*V
- B'= Y' + 1.8556*U + 0.0000*V
-*/
-
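-/* Worked example (illustration): with Y' = 128, U = Cb - 128 = -48, V = Cr - 128 = 60:
- *     R' = 128 + 1.5748*60                = 222.49 -> 222
- *     G' = 128 - 0.1873*(-48) - 0.4681*60 = 108.90 -> 108
- *     B' = 128 + 1.8556*(-48)             =  38.93 ->  38
- * (truncated toward zero and saturated to [0,255], matching the
- * convert_..._sat_rtz conversions used below).
- */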
-/** Convert an NV12 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV12_to_RGB888_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(rgb_output))
-{
- Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
- // handle 8 pixels every time, two lines, each line for 4 pixels
- uchar4 luma_0 = vload4(0, in_luma.ptr);
- uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
- uchar4 cbcr = vload4(0, in_uv.ptr);
- char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
- char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
- float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
- float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
- float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
- float4 f_r = convert_float4(luma_0) + temp0;
- float4 f_g = convert_float4(luma_0) + temp1;
- float4 f_b = convert_float4(luma_0) + temp2;
-
- uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
- uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
- uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
- uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
- uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
- vstore8(rgb_0, 0, out_rgb.ptr);
- vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
- f_r = convert_float4(luma_1) + temp0;
- f_g = convert_float4(luma_1) + temp1;
- f_b = convert_float4(luma_1) + temp2;
-
- r_0 = convert_uchar4_sat_rtz(f_r);
- g_0 = convert_uchar4_sat_rtz(f_g);
- b_0 = convert_uchar4_sat_rtz(f_b);
-
- rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
- rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
- vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
- vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an RGB image to YUV444 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in]  rgb_input_step_x                         rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void RGB888_to_YUV444_bt709(
- IMAGE_DECLARATION(rgb_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- // handle 4 pixels every time
- Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 4 pixels
- uchar16 rgb_0 = vload16(0, in_rgb.ptr);
- uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9);
- uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa);
- uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb);
-
- float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
- float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
- float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
-
- short4 i_y = convert_short4_rtz(f_y);
- short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
- short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
-
- uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
- vstore4(luma_0, 0, out_y.ptr);
-
- uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
- uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
- vstore4(cb_0, 0, out_u.ptr);
- vstore4(cr_0, 0, out_v.ptr);
-}
-
-/** Convert an RGB image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in]  rgb_input_step_x                         rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void RGB888_to_IYUV_bt709(
- IMAGE_DECLARATION(rgb_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- // handle 4 pixels every time, two lines, each line for 2 pixels
- Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 2 pixels of the first line
- uchar8 rgb_0 = vload8(0, in_rgb.ptr);
- uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
- uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
- uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
-
- float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
- float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
- float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
- short2 i_y = convert_short2_rtz(f_y);
- short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
- short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_0, 0, out_y.ptr);
-
- uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line
- uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
- uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
- uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
- uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
-
- f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
- f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
- f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
- i_y = convert_short2_rtz(f_y);
- i_u = convert_short2_rtz(f_u) + (short2)(128);
- i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
-
- uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
- uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
- ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
- *out_u.ptr = cbcr.x;
- *out_v.ptr = cbcr.y;
-}
-
-/** Convert an RGBA image to YUV444 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in]  rgba_input_step_x                        rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in]  rgba_input_step_y                        rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void RGBA8888_to_YUV444_bt709(
- IMAGE_DECLARATION(rgba_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- // handle 4 pixels every time
- Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 4 pixels
- uchar16 rgb_0 = vload16(0, in_rgba.ptr);
- uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc);
- uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd);
- uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se);
-
- float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
- float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
- float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
-
-    short4 i_y = convert_short4_rtz(f_y);
-    short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
-    short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
-
-    uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
-    vstore4(luma_0, 0, out_y.ptr);
-
-    uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
-    uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
- vstore4(cb_0, 0, out_u.ptr);
- vstore4(cr_0, 0, out_v.ptr);
-}
-
-/** Convert an RGBA image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in] input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
- * @param[out] uv_output_ptr Pointer to the destination uv channel. Supported Format: U8
- * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes)
- * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
- * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
- *
- */
-__kernel void RGBA8888_to_NV12_bt709(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(uv_output))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
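-    // handle 4 pixels every time, two lines, each line for 2 pixels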
-    // Read 2 pixels of the first line
- uchar8 rgb_0 = vload8(0, in.ptr);
- uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4);
- uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5);
- uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6);
-
- float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
- float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
- float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
- short2 i_y = convert_short2_rtz(f_y);
- short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
- short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_0, 0, out_y.ptr);
-
- uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line
- uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
- uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4);
- uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5);
- uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6);
-
- f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
- f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
- f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
- i_y = convert_short2_rtz(f_y);
- i_u = convert_short2_rtz(f_u) + (short2)(128);
- i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
-
- uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
- uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
- ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
- vstore2(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert an RGBA image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
- * No offset.
- *
- * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void RGBA8888_to_IYUV_bt709(
- IMAGE_DECLARATION(rgba_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- // handle 4 pixels every time, two lines, each line for 2 pixels
- Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
-    // Read 2 pixels of the first line
- uchar8 rgb_0 = vload8(0, in_rgb.ptr);
- uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4);
- uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5);
- uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6);
-
- float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
- float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
- float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
-
- short2 i_y = convert_short2_rtz(f_y);
- short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
- short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_0, 0, out_y.ptr);
-
- uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
-
-    // Read 2 pixels of the second line
- uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y);
- uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4);
- uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5);
- uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6);
-
- f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
- f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
- f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
-
- i_y = convert_short2_rtz(f_y);
- i_u = convert_short2_rtz(f_u) + (short2)(128);
- i_v = convert_short2_rtz(f_v) + (short2)(128);
-
- uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
- vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
-
- uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
- uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
- uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
- ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
- *out_u.ptr = cbcr.x;
- *out_v.ptr = cbcr.y;
-}
-
-/** Convert an NV12 image to RGBA8888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV12_to_RGBA8888_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(rgb_output))
-{
- Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
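-    // handle 8 pixels every time, two lines, each line for 4 pixels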
- uchar4 luma_0 = vload4(0, in_luma.ptr);
- uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
- uchar4 cbcr = vload4(0, in_uv.ptr);
- char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
- char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
- float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
- float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
- float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
- float4 f_r = convert_float4(luma_0) + temp0;
- float4 f_g = convert_float4(luma_0) + temp1;
- float4 f_b = convert_float4(luma_0) + temp2;
-
- uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
- uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
- uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
- uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
- uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- vstore8(rgb_0, 0, out_rgb.ptr);
- vstore8(rgb_1, 0, out_rgb.ptr + 8);
-
- f_r = convert_float4(luma_1) + temp0;
- f_g = convert_float4(luma_1) + temp1;
- f_b = convert_float4(luma_1) + temp2;
-
- r_0 = convert_uchar4_sat_rtz(f_r);
- g_0 = convert_uchar4_sat_rtz(f_g);
- b_0 = convert_uchar4_sat_rtz(f_b);
-
- rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
- rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
- vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an NV12 image to IYUV
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- */
-__kernel void NV12_to_IYUV_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- // handle 32 pixels every time, two lines, each line for 16 pixels
- uchar16 luma_0 = vload16(0, in_y.ptr);
- uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
- uchar16 cbcr = vload16(0, in_uv.ptr);
- uchar8 cb = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
- uchar8 cr = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
-
- vstore16(luma_0, 0, out_y.ptr);
- vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
- vstore8(cb, 0, out_u.ptr);
- vstore8(cr, 0, out_v.ptr);
-}
-
-/** Convert an NV12 image to YUV444
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- */
-__kernel void NV12_to_YUV444_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- // handle 32 pixels every time, two lines, each line for 16 pixels
- uchar16 luma_0 = vload16(0, in_y.ptr);
- uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
- uchar16 cbcr = vload16(0, in_uv.ptr);
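-    // Upsample 4:2:0 chroma to 4:4:4 by duplicating each Cb/Cr sample horizontally (below) and vertically (two row stores)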
- uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
- cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
- uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
- cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
-
- vstore16(luma_0, 0, out_y.ptr);
- vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
- vstore16(cb, 0, out_u.ptr);
- vstore16(cb, 0, out_u.ptr + u_output_stride_y);
- vstore16(cr, 0, out_v.ptr);
- vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an NV21 image to RGB888 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  uv_input_stride_y                        Stride of the source image uv channel in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in]  rgb_output_step_y                        rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV21_to_RGB888_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(rgb_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
- // Process 8 pixels per work-item: two rows of 4 pixels each
- uchar4 luma_0 = vload4(0, in_y.ptr);
- uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
- uchar4 cbcr = vload4(0, in_uv.ptr);
- char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
- char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
- float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
- float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
- float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
- float4 f_r = convert_float4(luma_0) + temp0;
- float4 f_g = convert_float4(luma_0) + temp1;
- float4 f_b = convert_float4(luma_0) + temp2;
-
- uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
- uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
- uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
- uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
- uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
- vstore8(rgb_0, 0, out_rgb.ptr);
- vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
- f_r = convert_float4(luma_1) + temp0;
- f_g = convert_float4(luma_1) + temp1;
- f_b = convert_float4(luma_1) + temp2;
-
- r_0 = convert_uchar4_sat_rtz(f_r);
- g_0 = convert_uchar4_sat_rtz(f_g);
- b_0 = convert_uchar4_sat_rtz(f_b);
-
- rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
- rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
- vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
- vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
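-
-/* The temp0/temp1/temp2 terms above implement the full-range BT.709
- * YCbCr -> RGB transform, applied per pixel as
- *
- *   R = Y + 1.5748 * Cr
- *   G = Y - 0.1873 * Cb - 0.4681 * Cr
- *   B = Y + 1.8556 * Cb
- *
- * with Cb/Cr recentred around zero (the "- 128" above). A minimal scalar
- * sketch of one pixel (the helper name is illustrative, not part of this file):
- *
- *   uchar3 ycbcr_to_rgb_bt709(uchar y, uchar cb, uchar cr)
- *   {
- *       const float fcb = (float)cb - 128.0f;
- *       const float fcr = (float)cr - 128.0f;
- *       const float3 rgb = (float3)((float)y + 1.5748f * fcr,
- *                                   (float)y - 0.1873f * fcb - 0.4681f * fcr,
- *                                   (float)y + 1.8556f * fcb);
- *       return convert_uchar3_sat_rtz(rgb);
- *   }
- */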
-
-/** Convert an NV21 image to RGBA8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void NV21_to_RGBA8888_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(rgba_output))
-{
- Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
-
- // Process 8 pixels per work-item: two rows of 4 pixels each
- uchar4 luma_0 = vload4(0, in_luma.ptr);
- uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
- uchar4 cbcr = vload4(0, in_uv.ptr);
- char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
- char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
-
- float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
- float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
- float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
- float4 f_r = convert_float4(luma_0) + temp0;
- float4 f_g = convert_float4(luma_0) + temp1;
- float4 f_b = convert_float4(luma_0) + temp2;
-
- uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
- uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
- uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
- uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
- uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- vstore8(rgb_0, 0, out_rgb.ptr);
- vstore8(rgb_1, 0, out_rgb.ptr + 8);
-
- f_r = convert_float4(luma_1) + temp0;
- f_g = convert_float4(luma_1) + temp1;
- f_b = convert_float4(luma_1) + temp2;
-
- r_0 = convert_uchar4_sat_rtz(f_r);
- g_0 = convert_uchar4_sat_rtz(f_g);
- b_0 = convert_uchar4_sat_rtz(f_b);
-
- rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
- rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
- vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
-}
-
-/** Convert an NV21 image to YUV444
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- */
-__kernel void NV21_to_YUV444_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- // Process 32 pixels per work-item: two rows of 16 pixels each
- uchar16 luma_0 = vload16(0, in_y.ptr);
- uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
- uchar16 cbcr = vload16(0, in_uv.ptr);
- uchar16 cr = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
- cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
- uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
- cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
-
- vstore16(luma_0, 0, out_y.ptr);
- vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
- vstore16(cb, 0, out_u.ptr);
- vstore16(cb, 0, out_u.ptr + u_output_stride_y);
- vstore16(cr, 0, out_v.ptr);
- vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an NV21 image to IYUV
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
- * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
- * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- */
-__kernel void NV21_to_IYUV_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(uv_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- uchar16 luma_0 = vload16(0, in_y.ptr);
- uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
- uchar16 cbcr = vload16(0, in_uv.ptr);
- uchar8 cr = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
- uchar8 cb = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
-
- vstore16(luma_0, 0, out_y.ptr);
- vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
- vstore8(cb, 0, out_u.ptr);
- vstore8(cr, 0, out_v.ptr);
-}
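-
-/* NV21 -> IYUV keeps chroma at 4:2:0 resolution, so the kernel only
- * de-interleaves the VU pairs into the two planar outputs (NV21 stores V
- * first, U second). A sketch of what the cb/cr swizzles of the loaded
- * uchar16 compute:
- *
- *   V_plane[i] = uv[2 * i];
- *   U_plane[i] = uv[2 * i + 1];
- *
- * One chroma row is written per pair of luma rows, as IYUV requires.
- */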
-
-/** Convert a UYVY image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void UYVY422_to_IYUV_bt709(
- IMAGE_DECLARATION(uyvy_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- // Process 16 pixels per work-item: two rows of 8 pixels each
- uchar16 uyvy = vload16(0, in_uyvy.ptr);
- uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
- ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
- ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
- vstore8(luma, 0, out_y.ptr);
-
- uyvy = vload16(0, in_uyvy.ptr + uyvy_input_stride_y);
- luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
- ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
- ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
- vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
- uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
- uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
- vstore4(cb, 0, out_u.ptr);
- vstore4(cr, 0, out_v.ptr);
-}
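-
-/* Going from 4:2:2 to 4:2:0 requires vertical chroma decimation, done here by
- * averaging the chroma of the two source rows; widening to ushort first keeps
- * the intermediate sum (up to 510) from overflowing uchar:
- *
- *   cb_420 = (uchar)((cb_row0 + cb_row1) / 2); // truncating average
- *
- * OpenCL's hadd(cb_row0, cb_row1) computes the same truncating average, and
- * rhadd() would give the rounding variant ((a + b + 1) / 2).
- */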
-
-/** Convert a YUYV image to IYUV using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void YUYV422_to_IYUV_bt709(
- IMAGE_DECLARATION(yuyv_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- // Process 16 pixels per work-item: two rows of 8 pixels each
- uchar16 yuyv = vload16(0, in_yuyv.ptr);
- uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
- ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
- ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
- vstore8(luma, 0, out_y.ptr);
-
- yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
- luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
- ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
- ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
- vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
- uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
- uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
- vstore4(cb, 0, out_u.ptr);
- vstore4(cr, 0, out_v.ptr);
-}
-
-/** Convert an IYUV image to RGB888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
- * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
- * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
- * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
- * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
- * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
- * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
- * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void IYUV_to_RGB888_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(u_input),
- IMAGE_DECLARATION(v_input),
- IMAGE_DECLARATION(rgb_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
- Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
- Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
-
- // Process 8 pixels per work-item: two rows of 4 pixels each
- uchar4 luma_0 = vload4(0, in_y.ptr);
- uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
- uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
- char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
- char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
-
- float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
- float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
- float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
- float4 f_r = convert_float4(luma_0) + temp0;
- float4 f_g = convert_float4(luma_0) + temp1;
- float4 f_b = convert_float4(luma_0) + temp2;
-
- uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
- uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
- uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
- uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
- uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
- vstore8(rgb_0, 0, out_rgb.ptr);
- vstore4(rgb_1, 0, out_rgb.ptr + 8);
-
- f_r = convert_float4(luma_1) + temp0;
- f_g = convert_float4(luma_1) + temp1;
- f_b = convert_float4(luma_1) + temp2;
-
- r_0 = convert_uchar4_sat_rtz(f_r);
- g_0 = convert_uchar4_sat_rtz(f_g);
- b_0 = convert_uchar4_sat_rtz(f_b);
-
- rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
- rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
- vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
- vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
-}
-
-/** Convert an IYUV image to RGBA8888
- *
- * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
- * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
- * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
- * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
- * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
- * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
- * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
- * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
- * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void IYUV_to_RGBA8888_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(u_input),
- IMAGE_DECLARATION(v_input),
- IMAGE_DECLARATION(rgba_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
- Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
- Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
-
- // Process 8 pixels per work-item: two rows of 4 pixels each
- uchar4 luma_0 = vload4(0, in_y.ptr);
- uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
- uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
- char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
- char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
-
- float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
- float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
- float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
-
- float4 f_r = convert_float4(luma_0) + temp0;
- float4 f_g = convert_float4(luma_0) + temp1;
- float4 f_b = convert_float4(luma_0) + temp2;
-
- uchar4 r_0 = convert_uchar4_sat_rtz(f_r);
- uchar4 g_0 = convert_uchar4_sat_rtz(f_g);
- uchar4 b_0 = convert_uchar4_sat_rtz(f_b);
-
- uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
- uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- vstore8(rgb_0, 0, out_rgb.ptr);
- vstore8(rgb_1, 0, out_rgb.ptr + 8);
-
- f_r = convert_float4(luma_1) + temp0;
- f_g = convert_float4(luma_1) + temp1;
- f_b = convert_float4(luma_1) + temp2;
-
- r_0 = convert_uchar4_sat_rtz(f_r);
- g_0 = convert_uchar4_sat_rtz(f_g);
- b_0 = convert_uchar4_sat_rtz(f_b);
-
- rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
- rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
- vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
- vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
-}
-
-/** Convert an IYUV image to YUV444
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
- * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
- * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
- * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
- * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
- * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
- * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
- * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
- * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
- * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
- * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
- * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
- * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
- *
- */
-__kernel void IYUV_to_YUV444_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(u_input),
- IMAGE_DECLARATION(v_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(u_output),
- IMAGE_DECLARATION(v_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
- Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
- Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
-
- // Process 32 pixels per work-item: two rows of 16 pixels each
- uchar16 luma_0 = vload16(0, in_y.ptr);
- uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
- uchar8 cb_src = vload8(0, in_u.ptr);
- uchar8 cr_src = vload8(0, in_v.ptr);
- uchar16 cb = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
- cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
- uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
- cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
-
- vstore16(luma_0, 0, out_y.ptr);
- vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
- vstore16(cb, 0, out_u.ptr);
- vstore16(cb, 0, out_u.ptr + u_output_stride_y);
- vstore16(cr, 0, out_v.ptr);
- vstore16(cr, 0, out_v.ptr + v_output_stride_y);
-}
-
-/** Convert an IYUV image to NV12
- *
- * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
- * No offset.
- *
- * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
- * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
- * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
- * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
- * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
- * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
- * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
- * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
- * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
- * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
- * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
- * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
- * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
- *
- */
-__kernel void IYUV_to_NV12_bt709(
- IMAGE_DECLARATION(luma_input),
- IMAGE_DECLARATION(u_input),
- IMAGE_DECLARATION(v_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(uv_output))
-{
- Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
- Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
- Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
- // Process 32 pixels per work-item: two rows of 16 pixels each
- uchar16 luma_0 = vload16(0, in_y.ptr);
- uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
- uchar8 cb = vload8(0, in_u.ptr);
- uchar8 cr = vload8(0, in_v.ptr);
- uchar16 cbcr = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
- cr.s6, cb.s7, cr.s7);
-
- vstore16(luma_0, 0, out_y.ptr);
- vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
- vstore16(cbcr, 0, out_uv.ptr);
-}
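-
-/* IYUV -> NV12 is the inverse de-interleave: the planar U and V rows are
- * zipped into one semi-planar UV row, which the element-by-element uchar16
- * cbcr constructor above spells out. As a sketch:
- *
- *   uv[2 * i]     = U_plane[i];
- *   uv[2 * i + 1] = V_plane[i];
- *
- * Chroma resolution is unchanged, so no filtering is needed.
- */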
-
-/** Convert a YUYV image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
- * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
- * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
- * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
- * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
- * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
- *
- */
-__kernel void YUYV422_to_NV12_bt709(
- IMAGE_DECLARATION(yuyv_input),
- IMAGE_DECLARATION(luma_output),
- IMAGE_DECLARATION(uv_output))
-{
- Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
- Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
-
- // Process 16 pixels per work-item: two rows of 8 pixels each
- uchar16 yuyv = vload16(0, in_yuyv.ptr);
- ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
- uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
- vstore8(luma, 0, out_y.ptr);
-
- yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
- ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
- luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
- vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
-
- uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
- vstore8(cbcr, 0, out_uv.ptr);
-}
-
-/** Convert a UYVY image to NV12 using BT709 color space
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- * No offset.
- *
- * @param[in] input_uyvy_ptr Pointer to the source image. Supported Format: U8
- * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
- * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
- * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
- * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
- * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
- * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
- * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
- * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
- *
- */
-__kernel void UYVY422_to_NV12_bt709(
- IMAGE_DECLARATION(input_uyvy),
- IMAGE_DECLARATION(luma),
- IMAGE_DECLARATION(uv))
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
- Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
- Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
-
- // Process 16 pixels per work-item: two rows of 8 pixels each
- const uchar16 uyvy_t = vload16(0, in.ptr);
- vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
-
- const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
- vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
-
- const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
- const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
- const uchar8 cbcr = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
- vstore8(cbcr, 0, out_uv.ptr);
-}
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl
index bc2c99b6c8..a04556a1ed 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/common/activation_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
index 66261019ab..38ee00b17a 100644
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/common/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
new file mode 100644
index 0000000000..413fcf5333
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
+
+#define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+#define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y)
+#define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (((x) > (y)) ? 1 : 0)
+#define ISLESS(x, y) (((x) < (y)) ? 1 : 0)
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y))
+#define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y))
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
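+
+/* These predicates feed select(), which picks lanes based on the MSB of the
+ * condition vector, so the vector paths must yield a 0 / -1 (all-ones) mask
+ * per lane; the scalar WIDTH path only needs a plain 0 / 1. A sketch of the
+ * semantics (a and b are illustrative float4 values):
+ *
+ *   int4 mask = isgreater(a, b); // 0 or -1 per lane
+ *   float4 m = select(b, a, mask); // a where mask is -1, else b
+ */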
+
+#if defined(WIDTH)
+
+#if defined(ARG_MAX)
+#define VECTOR_PREDICATE_EQ(x, y) ((x) >= (y))
+#define VECTOR_PREDICATE(x, y) ((x) > (y))
+#define SCALAR_SELECT_OP(x, y) (((x) > (y)) ? (x) : (y))
+#elif defined(ARG_MIN)
+#define VECTOR_PREDICATE_EQ(x, y) ((x) <= (y))
+#define VECTOR_PREDICATE(x, y) ((x) < (y))
+#define SCALAR_SELECT_OP(x, y) (((x) < (y)) ? (x) : (y))
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+inline void vectorized_compute_arg_min_max_2(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 2) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) res)
+{
+ if(VECTOR_PREDICATE_EQ(in.s0, in.s1))
+ {
+ *min_max_val = in.s0;
+ *min_max_idx = res.s0;
+ }
+ else
+ {
+ *min_max_val = in.s1;
+ *min_max_idx = res.s1;
+ }
+}
+
+inline void vectorized_compute_arg_min_max_4(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 4) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) res)
+{
+ VEC_DATA_TYPE(COND_DATA_TYPE, 2)
+ idx_sel = VECTOR_PREDICATE_EQ(in.s01, in.s23);
+ in.s01 = select(in.s23, in.s01, idx_sel);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+ idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+ *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
+ *min_max_idx = res.s0;
+}
+
+inline void vectorized_compute_arg_min_max_8(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 8) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) res)
+{
+ VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+ idx_sel = VECTOR_PREDICATE_EQ(in.s0123, in.s4567);
+ in.s0123 = select(in.s4567, in.s0123, idx_sel);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
+ idx_sel.s01 = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+ idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+ *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
+ *min_max_idx = res.s0;
+}
+
+inline void vectorized_compute_arg_min_max_16(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 16) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) res)
+{
+ VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+ idx_sel = VECTOR_PREDICATE_EQ(in.s01234567, in.s89abcdef);
+ in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+ res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) ));
+ idx_sel.s0123 = VECTOR_PREDICATE(in.s0123, in.s4567) || (in.s0123 == in.s4567 && CONVERT(((res.s0123 < res.s4567)), VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
+ in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
+ idx_sel.s01 = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+ idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+ *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
+ *min_max_idx = res.s0;
+}
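+
+/* Each vectorized_compute_arg_min_max_N above is a log2(N)-step tournament:
+ * every step compares the lower half of the value vector against the upper
+ * half and keeps the winning value together with its index, halving the
+ * problem until one (value, index) pair remains. The
+ * "|| (in == in' && res < res')" terms are the tie-break: on equal values the
+ * smaller (earlier) index wins, matching a scalar first-occurrence scan.
+ * Roughly:
+ *
+ *   for(int half = N / 2; half >= 1; half /= 2)
+ *       keep (val[i], idx[i]) over (val[i + half], idx[i + half])
+ *       when val[i] wins, or ties with the smaller index
+ */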
+
+inline void scalar_compute_global_min_max(DATA_TYPE in_val, int idx, DATA_TYPE *out_min_max_val, DATA_TYPE_OUTPUT *out_idx)
+{
+#if defined(ARG_MAX)
+ if(in_val > *out_min_max_val)
+#else // defined(ARG_MAX)
+ if(in_val < *out_min_max_val)
+#endif // defined(ARG_MAX)
+ {
+ *out_min_max_val = in_val;
+ *out_idx = idx;
+ }
+}
+
+#if VEC_SIZE > 1
+#if VEC_SIZE == 16
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_16(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 8 // #if VEC_SIZE == 16
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_8(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 4 // # elif VEC_SIZE == 8
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_4(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 2 // elif VEC_SIZE == 4
+ #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_2(min_max_val,min_max_idx,in,res)
+#else // elif VEC_SIZE == 2
+ #error "Not supported"
+#endif // #if VEC_SIZE == 16
+
+inline VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE) init_idx_vector()
+{
+#if VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+#elif VEC_SIZE == 8 // #if VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1, 2, 3, 4, 5, 6, 7 };
+#elif VEC_SIZE == 4 // elif VEC_SIZE == 8
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1, 2, 3 };
+#elif VEC_SIZE == 2 // elif VEC_SIZE == 4
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = { 0, 1 };
+#else // elif VEC_SIZE == 2
+#error "Not supported"
+#endif // #if VEC_SIZE == 16
+ return vidx;
+}
+#endif // VEC_SIZE > 1
+
+/** This kernel performs a reduction on the x-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The data type used for comparing indices must be passed at compile time using -DCOND_DATA_TYPE: e.g. -DCOND_DATA_TYPE=uint
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor, holding the computed indices. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_x(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ __global DATA_TYPE *input_addr = (__global DATA_TYPE *)(input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y);
+ __global DATA_TYPE_OUTPUT *output_addr = (__global DATA_TYPE_OUTPUT *)(output_ptr + output_offset_first_element_in_bytes + get_global_id(1) * output_stride_y);
+
+ DATA_TYPE final_value = input_addr[0];
+ DATA_TYPE_OUTPUT final_idx = 0;
+
+#if VEC_SIZE > 1
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+ vidx = init_idx_vector();
+
+ int x = 0;
+ for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ vals = VLOAD(VEC_SIZE)(0, (input_addr + x));
+ DATA_TYPE local_min_max_value;
+ DATA_TYPE_OUTPUT local_min_max_idx;
+
+ VECTORIZED_OP(&local_min_max_value, &local_min_max_idx, vals, vidx);
+ local_min_max_idx += x;
+ scalar_compute_global_min_max(local_min_max_value, local_min_max_idx, &final_value, &final_idx);
+ }
+#endif // VEC_SIZE > 1
+
+#if(WIDTH % VEC_SIZE)
+ LOOP_UNROLLING(int, j, 0, 1, WIDTH % VEC_SIZE,
+ {
+ scalar_compute_global_min_max(*(input_addr + j + x), j + x, &final_value, &final_idx);
+ })
+#endif // (WIDTH % VEC_SIZE)
+
+ output_addr[0] = final_idx;
+}
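+
+// Example of the split above (numbers illustrative): with -DWIDTH=19 and -DVEC_SIZE=8 the vector
+// loop covers x = 0 and x = 8 (16 elements), leaving x = 16 on exit; the unrolled tail then
+// handles the WIDTH % VEC_SIZE = 3 leftover elements at indices 16, 17 and 18.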
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on the y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor, holding the computed indices. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_y(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y;
+
+ VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT indx0 = 0;
+ for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y)
+ {
+ VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+ indx0 = select(indx0, (VEC_TYPE_OUT)y, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
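+
+// The leftover-first indexing above in numbers (illustrative): with VEC_SIZE = 4 and
+// VEC_SIZE_LEFTOVER = 2 (a row of 10 elements), x_offs evaluates to 0, 2 and 6 for global ids
+// 0, 1 and 2; only the first work-item takes the partial path of STORE_VECTOR_SELECT (2 elements),
+// so columns 0-9 are each written exactly once with no out-of-bounds access.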
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH) && !defined(BATCH)
+/** This kernel performs reduction on the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor, holding the computed indices. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_z(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT indx0 = 0;
+ for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+ {
+ VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+ indx0 = select(indx0, (VEC_TYPE_OUT)z, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DEPTH) && !defined(BATCH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor, holding the computed indices. Supported data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arg_min_max_w(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z +
+ (get_global_id(2) / DEPTH) * input_stride_w;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + (get_global_id(
+ 2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w;
+
+ VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT indx0 = 0;
+ for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+ {
+ VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_TYPE_IN);
+
+ VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+ indx0 = select(indx0, (VEC_TYPE_OUT)w, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
new file mode 100644
index 0000000000..18f54907df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(EPSILON)
+/** OpenCL kernel to fuse the weights of a convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
+ *
+ * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
+ * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
+ *       For depthwise convolution weights, do not pass DIM2
+ * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
+ * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
+ * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[out] w_fused_ptr                         (Optional) Pointer to the destination weights tensor. Supported data types: same as @p w_ptr
+ * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes)
+ * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
+ * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
+ * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
+ * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
+ * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes)
+ * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
+ * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
+ * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
+#if defined(BIAS)
+ VECTOR_DECLARATION(b),
+#endif // defined(BIAS)
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var)
+#ifndef IN_PLACE_W
+ ,
+ TENSOR3D_DECLARATION(w_fused)
+#endif // ifndef IN_PLACE_W
+#ifndef IN_PLACE_B
+ ,
+ VECTOR_DECLARATION(b_fused)
+#endif // ifndef IN_PLACE_B
+#if defined(BETA)
+ ,
+ VECTOR_DECLARATION(beta)
+#endif // defined(BETA)
+#if defined(GAMMA)
+ ,
+ VECTOR_DECLARATION(gamma)
+#endif // defined(GAMMA)
+ )
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+#if defined(DIM2)
+ int c0 = z % DIM2;
+ int c1 = z / DIM2;
+#else // ! defined(DIM2)
+ int c0 = 0;
+#if defined(NHWC)
+ int c1 = x;
+#else // defined(NHWC)
+ int c1 = z;
+#endif // defined(NHWC)
+#endif // defined(DIM2)
+
+ int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
+ int v_offset = c1 * sizeof(DATA_TYPE);
+
+ DATA_TYPE w_old = 0.0f;
+ DATA_TYPE b_old = 0.0f;
+ DATA_TYPE w_new = 0.0f;
+ DATA_TYPE b_new = 0.0f;
+ DATA_TYPE gamma = 1.0f;
+ DATA_TYPE mean = 0.0f;
+ DATA_TYPE var = 1.0f;
+ DATA_TYPE beta = 0.0f;
+
+ w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
+ var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
+ mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
+
+#if defined(GAMMA)
+ gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
+#endif // defined(GAMMA)
+
+ // Compute new weight
+ w_new = (gamma * w_old) / (sqrt(var + EPSILON));
+
+#if defined(IN_PLACE_W)
+ *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
+#else // defined(IN_PLACE_W)
+ *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
+#endif // defined(IN_PLACE_W)
+
+ // Compute bias
+#if !defined(DIM2) && defined(NHWC)
+ if(z == 0 && y == 0)
+#else // !defined(DIM2) && defined(NHWC)
+ if(x == 0 && y == 0 && c0 == 0)
+#endif // !defined(DIM2) && defined(NHWC)
+ {
+#if defined(BIAS)
+ b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
+#endif // defined(BIAS)
+#if defined(BETA)
+ beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
+#endif // defined(BETA)
+
+ b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
+
+#if defined(BIAS)
+
+#if defined(IN_PLACE_B)
+ *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
+#else // defined(IN_PLACE_B)
+ *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // defined(IN_PLACE_B)
+
+#else // defined(BIAS)
+
+#ifndef IN_PLACE_B
+ *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // ifndef IN_PLACE_B
+
+#endif // defined(BIAS)
+ }
+}
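+
+// Derivation sketch (a reading aid, not part of the kernel): batch normalization of a convolution
+// output computes y = gamma * ((w * x + b) - mean) / sqrt(var + EPSILON) + beta, which is affine
+// in x and therefore folds into the convolution as
+//   w_new = gamma * w / sqrt(var + EPSILON)
+//   b_new = gamma * (b - mean) / sqrt(var + EPSILON) + beta
+// matching the two expressions computed above.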
+#endif // defined(DATA_TYPE) && defined(EPSILON) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl
index b88b3bca22..e142c1d275 100644
--- a/src/core/CL/cl_kernels/bitwise_op.cl
+++ b/src/core/CL/cl_kernels/common/bitwise_op.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,14 @@
*/
#include "helpers.h"
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
/** This function computes the bitwise OR of two input images.
*
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
* @param[in] in1_ptr Pointer to the source image. Supported data types: U8
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
@@ -49,18 +55,32 @@ __kernel void bitwise_or(
IMAGE_DECLARATION(in2),
IMAGE_DECLARATION(out))
{
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
- uchar16 in_a = vload16(0, in1.ptr);
- uchar16 in_b = vload16(0, in2.ptr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = in_a | in_b;
- vstore16(in_a | in_b, 0, out.ptr);
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
/** This function computes the bitwise AND of two input images.
*
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
* @param[in] in1_ptr Pointer to the source image. Supported data types: U8
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
@@ -85,18 +105,32 @@ __kernel void bitwise_and(
IMAGE_DECLARATION(in2),
IMAGE_DECLARATION(out))
{
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
- uchar16 in_a = vload16(0, in1.ptr);
- uchar16 in_b = vload16(0, in2.ptr);
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
- vstore16(in_a & in_b, 0, out.ptr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = in_a & in_b;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
/** This function computes the bitwise XOR of two input images.
*
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
* @param[in] in1_ptr Pointer to the source image. Supported data types: U8
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
@@ -121,17 +155,31 @@ __kernel void bitwise_xor(
IMAGE_DECLARATION(in2),
IMAGE_DECLARATION(out))
{
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- uchar16 in_a = vload16(0, in1.ptr);
- uchar16 in_b = vload16(0, in2.ptr);
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
- vstore16(in_a ^ in_b, 0, out.ptr);
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = in_a ^ in_b;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-/** This function computes the bitwise NOT of an image.
+/** This function computes the bitwise NOT of an image.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -147,13 +195,24 @@ __kernel void bitwise_xor(
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
*/
__kernel void bitwise_not(
- IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(in1),
IMAGE_DECLARATION(out))
{
- Image in = CONVERT_TO_IMAGE_STRUCT(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- uchar16 in_data = vload16(0, in.ptr);
+ // Get pixels pointer
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
- vstore16(~in_data, 0, out.ptr);
+ // Load data
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+
+ VEC_DATA_TYPE(uchar, VEC_SIZE)
+ data0 = ~in_a;
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
+
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file
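
For reference, a sketch of the compile-time options the reworked bitwise kernels now expect (numbers illustrative):

    // An image 150 elements wide, processed 16 elements per work-item:
    //   -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=6    (150 % 16 == 6)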
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
index a9b0496a6e..f2e9cb0ed0 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants
-/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time
+/** Transforms proposal bounding boxes to target bounding boxes using bounding box deltas.
*
* @attention The following variables must be passed at compile time:
* -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
index 9e5cee55f4..c1d45a56b9 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants
-/** Perform a padded copy of input tensor to the output tensor for quantized data types. Padding values are defined at compile time
+/** Transforms proposal bounding boxes to target bounding boxes using bounding box deltas for quantized data types.
*
* @attention The following variables must be passed at compile time:
 * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM16 for boxes and pred_boxes, QASYMM8 for deltas
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/common/cast.cl
index 046b26df01..036a683ec7 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/common/cast.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,14 @@
#include "helpers.h"
#ifdef SATURATE
-#if defined(IS_DATA_TYPE_FLOAT)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-#else /* defined(IS_DATA_TYPE_FLOAT) */
#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
-#endif /* defined(IS_DATA_TYPE_FLOAT) */
-#else /* SATURATE */
+#else /* SATURATE */
#define CONVERT_DOWN(x, type) CONVERT(x, type)
#endif /* SATURATE */
#define CONVERT_UP(x, type) CONVERT(x, type)
-/** This function performs a down-scaling depth conversion.
+/** This function performs a down-cast of the input tensor.
*
* @attention For QSYMM8_PER_CHANNEL -> QASYMM8, it is user's responsibility to keep track of the quantization info.
*
@@ -61,12 +56,10 @@
* @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] shift The integer shift amount value. Supported data types: S32
*/
-__kernel void convert_depth_down(
+__kernel void cast_down(
TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
- const int shift)
+ TENSOR3D_DECLARATION(out))
{
int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
@@ -87,12 +80,12 @@ __kernel void convert_depth_down(
STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#else /* defined(IS_DATA_TYPE_FLOAT) */
VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- res0 = CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+ res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#endif /* defined(IS_DATA_TYPE_FLOAT) */
}
-/** This function performs a up-scaling depth conversion.
+/** This function performs an up-cast of the input tensor.
*
* @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
@@ -115,12 +108,10 @@ __kernel void convert_depth_down(
* @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] shift The integer shift amount value. Supported data types: S32
*/
-__kernel void convert_depth_up(
+__kernel void cast_up(
TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
- const int shift)
+ TENSOR3D_DECLARATION(out))
{
int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
@@ -137,7 +128,7 @@ __kernel void convert_depth_up(
STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#else /* defined(IS_DATA_TYPE_FLOAT) */
VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)) << shift;
+ res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#endif /* defined(IS_DATA_TYPE_FLOAT) */
}
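
Migration note for the hunk above: besides the rename, both kernels drop the trailing run-time shift argument. A hedged host-side sketch (error handling omitted, kernel names as in the diff):

    // Before: cl_kernel k = clCreateKernel(program, "convert_depth_down", &err);
    //         plus one extra clSetKernelArg(...) for the S32 shift value
    // After:  cl_kernel k = clCreateKernel(program, "cast_down", &err);
    //         tensor arguments only; no shift argument is set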
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl
index 59c2d8a3aa..4dc005fd43 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/common/col2im.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,7 +67,7 @@ __kernel void col2im(
TENSOR4D_DECLARATION(dst))
{
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst);
const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
diff --git a/src/core/CL/cl_kernels/common/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl
new file mode 100644
index 0000000000..00bb491f85
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/comparisons.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define EQUAL(x, y) ((x) == (y))
+#define NOTEQUAL(x, y) ((x) != (y))
+#define GREATER(x, y) ((x) > (y))
+#define GREATEREQUAL(x, y) ((x) >= (y))
+#define LESS(x, y) ((x) < (y))
+#define LESSEQUAL(x, y) ((x) <= (y))
+
+#ifdef IS_QUANTIZED
+# define DEFINE_KERNEL_STR(name) compare_##name##_quantized
+#else // IS_QUANTIZED
+# define DEFINE_KERNEL_STR(name) compare_##name
+#endif // IS_QUANTIZED
+
+#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
+/** This function compares two tensors.
+ *
+ * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types.
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void DEFINE_KERNEL(OP_NAME)(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ int dst_x = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+
+#if VEC_SIZE_IN1 == 1
+ int in1_x = 0;
+#else // VEC_SIZE_IN1 == 1
+ int in1_x = dst_x;
+#endif // VEC_SIZE_IN1 == 1
+
+#if VEC_SIZE_IN2 == 1
+ int in2_x = 0;
+#else // VEC_SIZE_IN2 == 1
+ int in2_x = dst_x;
+#endif // VEC_SIZE_IN2 == 1
+
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ in1_ptr += in1_offset_first_element_in_bytes + z * in1_stride_z + y * in1_stride_y + in1_x * sizeof(DATA_TYPE);
+ in2_ptr += in2_offset_first_element_in_bytes + z * in2_stride_z + y * in2_stride_y + in2_x * sizeof(DATA_TYPE);
+ out_ptr += out_offset_first_element_in_bytes + z * out_stride_z + y * out_stride_y + dst_x * sizeof(uchar);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_ptr);
+
+ // Calculate and store result
+#ifdef IS_QUANTIZED
+ VEC_DATA_TYPE(int, VEC_SIZE) in_a_i32 = CONVERT(in_a, VEC_DATA_TYPE(int, VEC_SIZE));
+ VEC_DATA_TYPE(int, VEC_SIZE) in_b_i32 = CONVERT(in_b, VEC_DATA_TYPE(int, VEC_SIZE));
+
+ VEC_DATA_TYPE(float, VEC_SIZE) in_a_fp = CONVERT(in_a_i32 - OFFSET_IN1, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN1;
+ VEC_DATA_TYPE(float, VEC_SIZE) in_b_fp = CONVERT(in_b_i32 - OFFSET_IN2, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN2;
+#else // IS_QUANTIZED
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a_fp = in_a;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b_fp = in_b;
+#endif // IS_QUANTIZED
+
+#if VEC_SIZE == 1
+ uchar res0 = (uchar)select(0, 255, OP(in_a_fp, in_b_fp));
+#else // VEC_SIZE == 1
+ VEC_DATA_TYPE(uchar, VEC_SIZE) res0 = CONVERT(OP(in_a_fp, in_b_fp), VEC_DATA_TYPE(uchar, VEC_SIZE));
+#endif // VEC_SIZE == 1
+
+ STORE_VECTOR_SELECT(res, uchar, out_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
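
The kernel symbol is assembled by the DEFINE_KERNEL macros above, so one source file serves every comparison; for example (build options illustrative):

    // -DOP=GREATER -DOP_NAME=greater                -> __kernel void compare_greater(...)
    // -DOP=GREATER -DOP_NAME=greater -DIS_QUANTIZED -> __kernel void compare_greater_quantized(...)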
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl
index d2e65408dc..dc7210a4c4 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/common/concatenate.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,19 +43,17 @@ inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset,
#if defined(DATA_TYPE)
#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#if defined(DEPTH) && defined(ELEMENT_SIZE)
-#if defined(INPUT1_WIDTH)
+#if defined(ELEMENT_SIZE)
#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#define SEQ VEC_OFFS(int, VEC_SIZE)
+#if defined(CONCATENATE_WIDTH_X2)
/** This kernel concatenates two input tensors into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
- * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
*
* @param[in] src1_ptr Pointer to the source tensor. Supported data types: All.
* @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -87,11 +85,15 @@ inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset,
* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
+ * @param[in] INPUT1_WIDTH First input tensor width
*/
__kernel void concatenate_width_x2(
TENSOR4D_DECLARATION(src1),
TENSOR4D_DECLARATION(src2),
- TENSOR4D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH,
+ const int INPUT1_WIDTH)
{
// Calculate input indices
const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
@@ -125,17 +127,15 @@ __kernel void concatenate_width_x2(
STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+#endif // defined(CONCATENATE_WIDTH_X2)
-#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
+#if defined(CONCATENATE_WIDTH_X4)
/** This kernel concatenates four input tensors into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
* @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
- * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
- * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
- * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
*
* @param[in] src1_ptr Pointer to the source tensor. Supported data types: All
* @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -187,13 +187,21 @@ __kernel void concatenate_width_x2(
* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
+ * @param[in] INPUT1_WIDTH First input tensor width
+ * @param[in] INPUT2_WIDTH Second input tensor width
+ * @param[in] INPUT3_WIDTH Third input tensor width
*/
__kernel void concatenate_width_x4(
TENSOR4D_DECLARATION(src1),
TENSOR4D_DECLARATION(src2),
TENSOR4D_DECLARATION(src3),
TENSOR4D_DECLARATION(src4),
- TENSOR4D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH,
+ const int INPUT1_WIDTH,
+ const int INPUT2_WIDTH,
+ const int INPUT3_WIDTH)
{
// Calculate input indices
const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
@@ -251,18 +259,17 @@ __kernel void concatenate_width_x4(
STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
-#endif /* defined(INPUT1_WIDTH) */
-#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
+#endif /* defined(CONCATENATE_WIDTH_X4) */
+#endif /* defined(ELEMENT_SIZE) */
-#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+#if defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+#if defined(CONCATENATE_WIDTH)
/** This kernel concatenates the input tensor into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
* @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -284,11 +291,12 @@ __kernel void concatenate_width_x4(
* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
*/
-
__kernel void concatenate_width(
TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH)
{
// Calculate input indices
const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
@@ -308,19 +316,18 @@ __kernel void concatenate_width(
STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
}
-
-#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
+#endif /* defined(CONCATENATE_WIDTH) */
+#endif /* defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
#if defined(VEC_SIZE_LEFTOVER)
-
-#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
+#if defined(CONCATENATE_HEIGHT)
+#if defined(HEIGHT_OFFSET) && defined(VEC_SIZE)
/** This kernel concatenates the input tensor into the output tensor along the second dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
* @note Vector sizes supported are 2,4,8 and 16.
* @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
@@ -343,11 +350,12 @@ __kernel void concatenate_width(
* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] DEPTH Tensor depth
*/
-
__kernel void concatenate_height(
TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(dst),
+ const int DEPTH)
{
const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
@@ -365,9 +373,10 @@ __kernel void concatenate_height(
STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
}
+#endif /* defined(HEIGHT_OFFSET) && defined(VEC_SIZE) */
+#endif /* defined(CONCATENATE_HEIGHT) */
-#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) */
-
+#if defined(CONCATENATE)
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
@@ -410,6 +419,7 @@ __kernel void concatenate(
STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+#endif // defined(CONCATENATE)
#endif /* defined(VEC_SIZE_LEFTOVER) */
#endif /* defined(DATA_TYPE) */
#endif /* defined(VEC_SIZE) */
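
With DEPTH and INPUT1_WIDTH moved from -D defines to trailing kernel arguments in the hunk above, one compiled binary now serves any depth and width. A minimal host-side sketch (the argument index num_tensor_args is hypothetical):

    cl_int depth        = 16;
    cl_int input1_width = 8;
    clSetKernelArg(kernel, num_tensor_args,     sizeof(cl_int), &depth);
    clSetKernelArg(kernel, num_tensor_args + 1, sizeof(cl_int), &input1_width);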
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
index a451c0213b..01ef04a7d6 100644
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl
index cfd1f12328..be76929ac8 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/common/convolution_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl
index 9c90969827..753b98d1b0 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/common/copy_tensor.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl
index 62ae36ac5c..d9090dc838 100644
--- a/src/core/CL/cl_kernels/crop_tensor.cl
+++ b/src/core/CL/cl_kernels/common/crop_tensor.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#if defined(DATA_TYPE) // Compile time constants
-/** Performs a copy of input tensor to the output tensor.
+/** Performs a crop of the input tensor to the output tensor.
*
* @param[in] in_ptr Pointer to the source tensor. Supported data types: All
* @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
index b1d5e61476..4ac5e3f0e9 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl
new file mode 100644
index 0000000000..7fa62577ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+/** This performs the dequantization of 8-bit quantized integers to floating point.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void dequantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = SCALE;
+
+ const VEC_DATA_TYPE(int, VEC_SIZE)
+ voffset = OFFSET;
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
\ No newline at end of file
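
As a minimal scalar sketch of the math above (the helper name is hypothetical, not ACL API), dequantization maps a quantized value q with parameters (SCALE, OFFSET) to SCALE * (q - OFFSET):

    // Scalar sketch of the vectorised kernel body above
    float dequantize(int q, float scale, int offset)
    {
        return scale * (float)(q - offset); // e.g. (130 - 128) * 0.5f -> 1.0f
    }
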
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl
index f38f6202ad..91e51d9d1a 100644
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#include "helpers.h"
-#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT)
+#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE)
/** List of all the operations supported by this kernel.
* @note ADD and SUB operations, when executed on integers, support saturation */
@@ -38,12 +38,18 @@
#define MAX(x, y) max(x, y)
#define MIN(x, y) min(x, y)
#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define DIV(x, y) (x / y)
#define POWER(x, y) pow(x, y)
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))))
-#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)) & 1)
-#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)) & 1)
+#if VEC_SIZE_OUT == 1
+#define PRELU(x, y) (x > 0 ? x : x * y)
+#else // VEC_SIZE_OUT == 1
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))))
+#endif // VEC_SIZE_OUT == 1
+
+#define DIV(x, y) (x / y)
+
+#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
+#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
#define OP_FUN_NAME_STR(op) elementwise_operation_##op
#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
@@ -56,8 +62,7 @@
*
* @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE_OUT
- * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @note The input and output data type has to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
* @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
* @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
*
@@ -88,8 +93,12 @@
*/
__kernel void OP_FUN_NAME(OP)(
TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
+ TENSOR3D_DECLARATION(in2)
+#if !defined(IN_PLACE)
+ ,
+ TENSOR3D_DECLARATION(out)
+#endif // !defined(IN_PLACE)
+)
{
#if VEC_SIZE_IN1 == 1
uint in1_x_offs = 0;
@@ -101,26 +110,37 @@ __kernel void OP_FUN_NAME(OP)(
#else // VEC_SIZE_IN2 == 1
uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
#endif // VEC_SIZE_IN2 == 1
+#if !defined(IN_PLACE)
uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+#endif // !defined(IN_PLACE)
// Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_IN1) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_IN2) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
// Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
- in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT));
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
- in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+ in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+ in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
// Calculate and store result
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
res0 = OP(in_a, in_b);
#if defined(ACTIVATION_TYPE)
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
- STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) */
+#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE) */
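
A hedged sketch of why PRELU now needs the VEC_SIZE_OUT == 1 special case: OpenCL's vector relational operators return per-lane masks that feed select(), while the scalar form is most naturally a ternary. Both snippets below compute the same "x > 0 ? x : alpha * x":

    // Scalar form: plain ternary
    float prelu_scalar(float x, float alpha) { return x > 0 ? x : x * alpha; }

    // Vector form: (x > (float4)0) yields an int4 lane mask; select() keeps x where set
    float4 prelu_vec4(float4 x, float4 alpha)
    {
        return select(alpha * x, x, x > (float4)0);
    }
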
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
index a08c3b2d47..a11be80875 100644
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#define MAX(x, y) max((x), (y))
#define MIN(x, y) min((x), (y))
#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
#define DIV(x, y) (x / y)
#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
@@ -37,11 +37,11 @@
#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
-#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT)
+#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE)
#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
/** This function executes an element-wise operation among two tensors.
*
@@ -57,7 +57,7 @@
* @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
* @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
* @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero
- * @note The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar
+ * @note The data type must be passed at compile time using -DDATA_TYPE, i.e. -DDATA_TYPE=uchar
*
* @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -86,8 +86,12 @@
*/
__kernel void OP_FUN_NAME(OP)(
TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
+ TENSOR3D_DECLARATION(in2)
+#if !defined(IN_PLACE)
+ ,
+ TENSOR3D_DECLARATION(out)
+#endif // !defined(IN_PLACE)
+)
{
#if VEC_SIZE_IN1 == 1
uint in1_x_offs = 0;
@@ -99,15 +103,26 @@ __kernel void OP_FUN_NAME(OP)(
#else // VEC_SIZE_IN2 == 1
uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
#endif // VEC_SIZE_IN2 == 1
+#if !defined(IN_PLACE)
uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+#endif // !defined(IN_PLACE)
// Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
- __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
- VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
- VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
+ VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_INT);
+ VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_INT);
in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
@@ -118,6 +133,6 @@ __kernel void OP_FUN_NAME(OP)(
const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
// Store result
- STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */
+#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE) */
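
The kernel above vectorises the usual requantization pipeline: dequantize both operands, apply the operation in float, then requantize with saturation. A scalar sketch for ADD on QASYMM8 (the helper name is hypothetical; the upper-case names stand for the kernel's -D defines):

    uchar add_qasymm8(uchar a, uchar b)
    {
        const float fa = ((int)a - OFFSET_IN1) * (float)SCALE_IN1; // dequantize
        const float fb = ((int)b - OFFSET_IN2) * (float)SCALE_IN2;
        const int   q  = convert_int_rte((fa + fb) / (float)SCALE_OUT) + OFFSET_OUT;
        return convert_uchar_sat(q);                               // requantize
    }
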
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
index 63594aea83..81835108a3 100644
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "helpers.h"
-#include "warp_helpers.h"
#if defined(DATA_TYPE) && defined(OPERATION)
@@ -38,12 +37,13 @@
#define fabs_op(input) fabs(input)
// Calculate natural_log
#define natural_log_op(input) log(input)
-// Calculate round (Cannot use round function as it rounds halfway cases away from zero).
+// Calculate round using the round-to-nearest-even rounding mode
+#define round_op(input) rint(input)
+
#if defined(VEC_SIZE)
-#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
-#define logical_not_op(input) CONVERT((!input) & 0x1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define logical_not_op(input) CONVERT(CONVERT(!input, VEC_TYPE) & ((VEC_TYPE)0x1), VEC_TYPE)
#else // defined(VEC_SIZE)
-#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE)
#define logical_not_op(input) ((!input) & 0x1)
#endif // defined(VEC_SIZE)
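
The switch to rint() above keeps the documented round-to-nearest-even behaviour without the saturating integer round-trip the old CONVERT_SAT_ROUND definition needed. A one-line sketch of the difference from round():

    // round() sends halfway cases away from zero; rint() ties to even:
    //   round(2.5f) -> 3.0f    rint(2.5f) -> 2.0f
    //   round(3.5f) -> 4.0f    rint(3.5f) -> 4.0f
    float round_to_even(float x) { return rint(x); } // hypothetical wrapper
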
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
new file mode 100644
index 0000000000..2e4cdc53fe
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+// Calculate reciprocal square root
+#define rsqrt_op(input) rsqrt(input)
+#if defined(VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif // defined(VEC_SIZE)
+
+/** Applies an element-wise unary operator to a tensor.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary_quantized(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+ out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_f32 = CONVERT(data, VEC_FLOAT);
+ data_f32 = (data_f32 - (float)OFFSET_IN) * (float)SCALE_IN;
+ VEC_INT qres_int = CONVERT_SAT((OPERATION(data_f32) / ((VEC_FLOAT)(float)SCALE_OUT)), VEC_INT) + ((VEC_INT)((int)OFFSET_OUT));
+ const VEC_TYPE qres = CONVERT_SAT(qres_int, VEC_TYPE);
+ VSTORE(VEC_SIZE)
+ (qres, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
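
Per element, the kernel above performs dequantize -> OPERATION -> requantize. A scalar sketch for OPERATION=rsqrt_op on QASYMM8 (hypothetical helper; the upper-case names stand for the kernel's -D defines):

    uchar rsqrt_qasymm8(uchar q)
    {
        const float x = ((int)q - OFFSET_IN) * (float)SCALE_IN;  // dequantize
        const float y = rsqrt(x);                                // 1 / sqrt(x)
        return convert_uchar_sat(convert_int_sat(y / (float)SCALE_OUT) + OFFSET_OUT);
    }
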
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/common/fft.cl
index eb1eec56e7..3f26d0f1a6 100644
--- a/src/core/CL/cl_kernels/fft.cl
+++ b/src/core/CL/cl_kernels/common/fft.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "helpers.h"
+#if defined(DATA_TYPE)
/** Calculates and applies the twiddle factor to a given input.
*
* @param[in] phi The angle.
@@ -30,9 +31,10 @@
*/
#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \
{ \
- float2 w, tmp; \
- w.x = native_cos(phi); \
- w.y = native_sin(phi); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ w, tmp; \
+ w.x = cos(phi); \
+ w.y = sin(phi); \
tmp.x = (w.x * input.x) - (w.y * input.y); \
tmp.y = (w.x * input.y) + (w.y * input.x); \
input = tmp; \
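
In complex form, the rewritten macro multiplies its input by the unit twiddle factor w = e^{i\phi}:

    w \cdot z = (\cos\phi + i\sin\phi)(x + iy) = (x\cos\phi - y\sin\phi) + i(y\cos\phi + x\sin\phi)

which is exactly the tmp.x/tmp.y lines above. The move from native_cos/native_sin to cos/sin presumably trades speed for the precision guarantees of the standard builtins now that half-precision data is supported.
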
@@ -43,12 +45,13 @@
* @param[in,out] c0 Complex input 0.
* @param[in,out] c1 Complex input 1.
*/
-#define DFT_2(c0, c1) \
- { \
- float2 v0; \
- v0 = c0; \
- c0 = v0 + c1; \
- c1 = v0 - c1; \
+#define DFT_2(c0, c1) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0; \
+ v0 = c0; \
+ c0 = v0 + c1; \
+ c1 = v0 - c1; \
}
// radix-3 butterfly unit factors
@@ -60,15 +63,17 @@
* @param[in,out] c1 Complex input 1.
* @param[in,out] c2 Complex input 2.
*/
-#define DFT_3(c0, c1, c2) \
- { \
- float2 v0 = c1 + c2; \
- float2 v1 = c1 - c2; \
- c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
- c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
- c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
- c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
- c0 = c0 + v0; \
+#define DFT_3(c0, c1, c2) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0 = c1 + c2; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v1 = c1 - c2; \
+ c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
+ c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
+ c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
+ c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
+ c0 = c0 + v0; \
}
/**Computes radix-4 butterfly unit.
@@ -78,25 +83,26 @@
* @param[in,out] c2 Complex input 2.
* @param[in,out] c3 Complex input 3.
*/
-#define DFT_4(c0, c1, c2, c3) \
- { \
- float2 v0, v1, v2, v3; \
- v0 = c0 + c2; \
- v1 = c1 + c3; \
- v2 = c0 - c2; \
- v3.x = c1.y - c3.y; \
- v3.y = c3.x - c1.x; \
- c0 = v0 + v1; \
- c2 = v0 - v1; \
- c1 = v2 + v3; \
- c3 = v2 - v3; \
+#define DFT_4(c0, c1, c2, c3) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3; \
+ v0 = c0 + c2; \
+ v1 = c1 + c3; \
+ v2 = c0 - c2; \
+ v3.x = c1.y - c3.y; \
+ v3.y = c3.x - c1.x; \
+ c0 = v0 + v1; \
+ c2 = v0 - v1; \
+ c1 = v2 + v3; \
+ c3 = v2 - v3; \
}
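
In complex form, the radix-4 butterfly above is two radix-2 stages with a -i rotation folded in: with s = c_0 + c_2, t = c_1 + c_3, d = c_0 - c_2 and u = -i(c_1 - c_3) (the v3.x/v3.y lines), the outputs are

    c_0' = s + t, \qquad c_1' = d + u, \qquad c_2' = s - t, \qquad c_3' = d - u
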
// radix-5 butterfly unit factors
-#define W5_A 0.30901699437494f
-#define W5_B 0.95105651629515f
-#define W5_C 0.80901699437494f
-#define W5_D 0.58778525229247f
+#define W5_A (DATA_TYPE)0.30901699437494f
+#define W5_B (DATA_TYPE)0.95105651629515f
+#define W5_C (DATA_TYPE)0.80901699437494f
+#define W5_D (DATA_TYPE)0.58778525229247f
/** Computes radix-5 butterfly unit.
*
@@ -106,28 +112,29 @@
* @param[in,out] c3 Complex input 3.
* @param[in,out] c4 Complex input 4.
*/
-#define DFT_5(c0, c1, c2, c3, c4) \
- { \
- float2 v0, v1, v2, v3, v4; \
- v0 = c0; \
- v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
- v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
- v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
- v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
- c0 = v0 + c1 + c2 + c3 + c4; \
- c1 = v0 + v1 + (float2)(v4.y, -v4.x); \
- c2 = v0 - v2 + (float2)(v3.y, -v3.x); \
- c3 = v0 - v2 + (float2)(-v3.y, v3.x); \
- c4 = v0 + v1 + (float2)(-v4.y, v4.x); \
+#define DFT_5(c0, c1, c2, c3, c4) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3, v4; \
+ v0 = c0; \
+ v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
+ v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
+ v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
+ v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
+ c0 = v0 + c1 + c2 + c3 + c4; \
+ c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
+ c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \
+ c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \
+ c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
}
// radix-7 butterfly unit factors
-#define W7_A 0.62348980185873f
-#define W7_B 0.78183148246802f
-#define W7_C 0.22252093395631f
-#define W7_D 0.97492791218182f
-#define W7_E 0.90096886790241f
-#define W7_F 0.43388373911755f
+#define W7_A (DATA_TYPE)0.62348980185873f
+#define W7_B (DATA_TYPE)0.78183148246802f
+#define W7_C (DATA_TYPE)0.22252093395631f
+#define W7_D (DATA_TYPE)0.97492791218182f
+#define W7_E (DATA_TYPE)0.90096886790241f
+#define W7_F (DATA_TYPE)0.43388373911755f
/** Computes radix-7 butterfly unit.
*
@@ -141,7 +148,8 @@
*/
#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \
{ \
- float2 v0, v1, v2, v3, v4, v5, v6; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3, v4, v5, v6; \
v0 = c0; \
v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
@@ -150,12 +158,12 @@
v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \
- c1 = v0 + v1 + (float2)(v4.y, -v4.x); \
- c2 = v0 - v2 + (float2)(v5.y, -v5.x); \
- c3 = v0 - v3 + (float2)(v6.y, -v6.x); \
- c4 = v0 - v3 + (float2)(-v6.y, v6.x); \
- c5 = v0 - v2 + (float2)(-v5.y, v5.x); \
- c6 = v0 + v1 + (float2)(-v4.y, v4.x); \
+ c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
+ c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x); \
+ c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x); \
+ c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x); \
+ c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x); \
+ c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
}
/** Computes radix-8 butterfly unit.
@@ -169,52 +177,55 @@
* @param[in,out] c6 Complex input 6.
* @param[in,out] c7 Complex input 7.
*/
-#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
- { \
- float2 v0, v1, v2, v3, v4, v5, v6, v7; \
- float2 s0, s1, s2, s3, s4, s5, s6, s7; \
- float2 t0, t1, t2; \
- v0 = c0 + c4; \
- v1 = c1 + c5; \
- v2 = c2 + c6; \
- v3 = c3 + c7; \
- v4 = c0 - c4; \
- v5 = c1 - c5; \
- v6 = c2 - c6; \
- v7 = c3 - c7; \
- s0 = v0 + v2; \
- s1 = v1 + v3; \
- s2 = v0 - v2; \
- s3 = v1 - v3; \
- s4.x = v4.x - v6.y; \
- s4.y = v4.y + v6.x; \
- s5.x = v5.x - v7.y; \
- s5.y = v5.y + v7.x; \
- s6.x = v4.x + v6.y; \
- s6.y = v4.y - v6.x; \
- s7.x = v5.x + v7.y; \
- s7.y = v5.y - v7.x; \
- t0.x = -s3.y; \
- t0.y = s3.x; \
- t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
- t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
- t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
- t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
- c0 = s0 + s1; \
- c1 = s6 - t2; \
- c2 = s2 - t0; \
- c3 = s4 - t1; \
- c4 = s0 - s1; \
- c5 = s6 + t2; \
- c6 = s2 + t0; \
- c7 = s4 + t1; \
+#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ v0, v1, v2, v3, v4, v5, v6, v7; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ s0, s1, s2, s3, s4, s5, s6, s7; \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ t0, t1, t2; \
+ v0 = c0 + c4; \
+ v1 = c1 + c5; \
+ v2 = c2 + c6; \
+ v3 = c3 + c7; \
+ v4 = c0 - c4; \
+ v5 = c1 - c5; \
+ v6 = c2 - c6; \
+ v7 = c3 - c7; \
+ s0 = v0 + v2; \
+ s1 = v1 + v3; \
+ s2 = v0 - v2; \
+ s3 = v1 - v3; \
+ s4.x = v4.x - v6.y; \
+ s4.y = v4.y + v6.x; \
+ s5.x = v5.x - v7.y; \
+ s5.y = v5.y + v7.x; \
+ s6.x = v4.x + v6.y; \
+ s6.y = v4.y - v6.x; \
+ s7.x = v5.x + v7.y; \
+ s7.y = v5.y - v7.x; \
+ t0.x = -s3.y; \
+ t0.y = s3.x; \
+ t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
+ t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
+ t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
+ t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
+ c0 = s0 + s1; \
+ c1 = s6 - t2; \
+ c2 = s2 - t0; \
+ c3 = s4 - t1; \
+ c4 = s0 - s1; \
+ c5 = s6 + t2; \
+ c6 = s2 + t0; \
+ c7 = s4 + t1; \
}
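
A hedged note on the constants above: the M_SQRT1_2_F terms are the 45-degree twiddle rotations folded into the radix-8 butterfly; e.g. the t1 lines apply

    e^{i\pi/4}(x + iy) = \frac{(x - y) + i(x + y)}{\sqrt{2}}

to s5, and the t2 lines apply the corresponding e^{i3\pi/4} rotation to s7.
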
/** Computes the first stage of a radix-2 DFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -231,7 +242,7 @@
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_2_first_stage_axis_0(
+__kernel void fft_radix_2_first_stage_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -248,20 +259,21 @@ kernel void fft_radix_2_first_stage_axis_0(
#endif /* IN_PLACE */
// Load two complex input values
- float4 data = vload4(0, (__global float *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data = vload4(0, (__global DATA_TYPE *)input.ptr);
// Compute DFT N = 2
DFT_2(data.s01, data.s23);
// Store two complex output values
- vstore4(data, 0, (__global float *)output.ptr);
+ vstore4(data, 0, (__global DATA_TYPE *)output.ptr);
}
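
A hedged note on the loads above: the FFT kernels assume complex values stored interleaved along X, two DATA_TYPE elements per value, so a single vload4 fetches one whole radix-2 butterfly:

    // data = (re0, im0, re1, im1); each .sNN pair below is one complex value
    VEC_DATA_TYPE(DATA_TYPE, 4) data = vload4(0, (__global DATA_TYPE *)input.ptr);
    DFT_2(data.s01, data.s23); // butterfly on the two complex values
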
/** Computes the first stage of a radix-2 DFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -278,7 +290,7 @@ kernel void fft_radix_2_first_stage_axis_0(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_2_first_stage_axis_1(
+__kernel void fft_radix_2_first_stage_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -295,22 +307,24 @@ kernel void fft_radix_2_first_stage_axis_1(
#endif /* IN_PLACE */
// Load two complex input values
- float2 data1 = vload2(0, (__global float *)input.ptr);
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
// Compute DFT N = 2
DFT_2(data1, data2);
// Store two complex output values
- vstore2(data1, 0, (__global float *)output.ptr);
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data1, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
}
/** Computes the first stage of a radix-3 DFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -327,7 +341,7 @@ kernel void fft_radix_2_first_stage_axis_1(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_3_first_stage_axis_0(
+__kernel void fft_radix_3_first_stage_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -344,22 +358,24 @@ kernel void fft_radix_3_first_stage_axis_0(
#endif /* IN_PLACE */
// Load three complex input values
- float4 data0 = vload4(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data0 = vload4(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0));
// Compute DFT N = 3
DFT_3(data0.s01, data0.s23, data1.s01);
// Store three complex output values
- vstore4(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));
+ vstore4(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0));
}
/** Computes the first stage of a radix-3 DFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -376,7 +392,7 @@ kernel void fft_radix_3_first_stage_axis_0(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_3_first_stage_axis_1(
+__kernel void fft_radix_3_first_stage_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -393,24 +409,27 @@ kernel void fft_radix_3_first_stage_axis_1(
#endif /* IN_PLACE */
// Load three complex input values
- float2 data0 = vload2(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
// Compute DFT N = 3
DFT_3(data0, data1, data2);
// Store three complex output values
- vstore2(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
}
/** Computes the first stage of a radix-4 DFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -427,7 +446,7 @@ kernel void fft_radix_3_first_stage_axis_1(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_4_first_stage_axis_0(
+__kernel void fft_radix_4_first_stage_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -444,20 +463,21 @@ kernel void fft_radix_4_first_stage_axis_0(
#endif /* IN_PLACE */
// Load four complex input values
- float8 data = vload8(0, (__global float *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)input.ptr);
// Compute DFT N = 4
DFT_4(data.s01, data.s23, data.s45, data.s67);
// Store four complex output values
- vstore8(data, 0, (__global float *)output.ptr);
+ vstore8(data, 0, (__global DATA_TYPE *)output.ptr);
}
/** Computes the first stage of a radix-4 DFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -474,7 +494,7 @@ kernel void fft_radix_4_first_stage_axis_0(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_4_first_stage_axis_1(
+__kernel void fft_radix_4_first_stage_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -491,26 +511,30 @@ kernel void fft_radix_4_first_stage_axis_1(
#endif /* IN_PLACE */
// Load four complex input values
- float2 data0 = vload2(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
- float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
// Compute DFT N = 4
DFT_4(data0, data1, data2, data3);
// Store four complex output values
- vstore2(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
}
/** Computes the first stage of a radix-5 DFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -527,7 +551,7 @@ kernel void fft_radix_4_first_stage_axis_1(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_5_first_stage_axis_0(
+__kernel void fft_radix_5_first_stage_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -544,22 +568,24 @@ kernel void fft_radix_5_first_stage_axis_0(
#endif /* IN_PLACE */
// Load five complex input values
- float8 data0 = vload8(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
// Compute DFT N = 5
DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
// Store five complex output values
- vstore8(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+ vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
}
/** Computes the first stage of a radix-5 DFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -576,7 +602,7 @@ kernel void fft_radix_5_first_stage_axis_0(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_5_first_stage_axis_1(
+__kernel void fft_radix_5_first_stage_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -593,28 +619,33 @@ kernel void fft_radix_5_first_stage_axis_1(
#endif /* IN_PLACE */
// Load five complex input values
- float2 data0 = vload2(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
- float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
- float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
// Compute DFT N = 5
DFT_5(data0, data1, data2, data3, data4);
// Store five complex output values
- vstore2(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
- vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
}
/** Computes the first stage of a radix-7 DFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -631,7 +662,7 @@ kernel void fft_radix_5_first_stage_axis_1(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_7_first_stage_axis_0(
+__kernel void fft_radix_7_first_stage_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -648,24 +679,27 @@ kernel void fft_radix_7_first_stage_axis_0(
#endif /* IN_PLACE */
// Load seven complex input values
- float8 data0 = vload8(0, (__global float *)input.ptr);
- float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0));
// Compute DFT N = 7
DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
// Store seven complex output values
- vstore8(data0, 0, (__global float *)output.ptr);
- vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));
+ vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0));
}
/** Computes the first stage of a radix-7 DFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -682,7 +716,7 @@ kernel void fft_radix_7_first_stage_axis_0(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_7_first_stage_axis_1(
+__kernel void fft_radix_7_first_stage_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -699,32 +733,39 @@ kernel void fft_radix_7_first_stage_axis_1(
#endif /* IN_PLACE */
// Load seven complex input values
- float2 data0 = vload2(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
- float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
- float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
- float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
- float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
// Compute DFT N = 7
DFT_7(data0, data1, data2, data3, data4, data5, data6);
// Store seven complex output values
- vstore2(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
- vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
- vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
- vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
+ vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
}
/** Computes the first stage of a radix-8 DFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -741,7 +782,7 @@ kernel void fft_radix_7_first_stage_axis_1(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_8_first_stage_axis_0(
+__kernel void fft_radix_8_first_stage_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -758,20 +799,21 @@ kernel void fft_radix_8_first_stage_axis_0(
#endif /* IN_PLACE */
// Load eight complex input values
- float16 data = vload16(0, (__global float *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)input.ptr);
// Compute DFT N = 8
DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
// Store eight complex output values
- vstore16(data, 0, (__global float *)output.ptr);
+ vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
}
/** Computes the first stage of a radix-8 DFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                         Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -788,7 +830,7 @@ kernel void fft_radix_8_first_stage_axis_0(
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
-kernel void fft_radix_8_first_stage_axis_1(
+__kernel void fft_radix_8_first_stage_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -805,34 +847,42 @@ kernel void fft_radix_8_first_stage_axis_1(
#endif /* IN_PLACE */
// Load eight complex input values
- float2 data0 = vload2(0, (__global float *)input.ptr);
- float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
- float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
- float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
- float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
- float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
- float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
- float2 data7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0));
// Compute DFT N = 8
DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
// Store eight complex output values
- vstore2(data0, 0, (__global float *)output.ptr);
- vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
- vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
- vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
- vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
- vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
- vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
- vstore2(data7, 0, (__global float *)tensor3D_offset(&output, 0, 7, 0));
+ vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
+ vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
+ vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0));
}
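
These kernels are specialised entirely at build time, so one source file covers F16 and F32. A hypothetical host-side build for the half-precision, in-place variant (-DDATA_TYPE and -DIN_PLACE are the documented defines; the program and device handles are assumed to exist already):

    // Sketch: compile the FFT kernels for F16, operating in-place.
    const char *build_opts = "-DDATA_TYPE=half -DIN_PLACE";
    cl_int      err        = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
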
/** Computes a stage of a radix-2 FFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -852,7 +902,7 @@ kernel void fft_radix_8_first_stage_axis_1(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_2_axis_0(
+__kernel void fft_radix_2_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -881,11 +931,13 @@ kernel void fft_radix_2_axis_0(
#endif /* IN_PLACE */
// Load two complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -894,15 +946,15 @@ kernel void fft_radix_2_axis_0(
DFT_2(c0, c1);
// Store two complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
}
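
With phi = nx * exp_const, TWIDDLE_FACTOR_MULTIPLICATION rotates c1 by e^(i*phi) before the butterfly. A sketch consistent with the standard complex product, under the same DATA_TYPE convention (the in-tree macro is defined near the top of this file):

    // (x + iy)(cos(phi) + i sin(phi)) expanded into real arithmetic.
    #define TWIDDLE_FACTOR_MULTIPLICATION(phi, input)  \
        {                                              \
            VEC_DATA_TYPE(DATA_TYPE, 2)                \
            w, tmp;                                    \
            w.x   = cos(phi);                          \
            w.y   = sin(phi);                          \
            tmp.x = (w.x * input.x) - (w.y * input.y); \
            tmp.y = (w.x * input.y) + (w.y * input.x); \
            input = tmp;                               \
        }
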
/** Computes a stage of a radix-2 FFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -922,7 +974,7 @@ kernel void fft_radix_2_axis_0(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_2_axis_1(
+__kernel void fft_radix_2_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -951,11 +1003,13 @@ kernel void fft_radix_2_axis_1(
#endif /* IN_PLACE */
// Load two complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -964,15 +1018,15 @@ kernel void fft_radix_2_axis_1(
DFT_2(c0, c1);
// Store two complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
}
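
DFT_2 itself is just the two-point butterfly, (c0, c1) -> (c0 + c1, c0 - c1), applied to complex pairs. Sketched under the same conventions (the in-tree macro lives earlier in this file):

    // Two-point butterfly; v0 buffers c0 so the subtraction still
    // sees the original value.
    #define DFT_2(c0, c1)               \
        {                               \
            VEC_DATA_TYPE(DATA_TYPE, 2) \
            v0 = c0;                    \
            c0 = v0 + c1;               \
            c1 = v0 - c1;               \
        }
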
/** Computes a stage of a radix-3 FFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -992,7 +1046,7 @@ kernel void fft_radix_2_axis_1(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_3_axis_0(
+__kernel void fft_radix_3_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1021,12 +1075,15 @@ kernel void fft_radix_3_axis_0(
#endif /* IN_PLACE */
// Load three complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1036,16 +1093,16 @@ kernel void fft_radix_3_axis_0(
DFT_3(c0, c1, c2);
// Store three complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
}
/** Computes a stage of a radix-3 FFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1065,7 +1122,7 @@ kernel void fft_radix_3_axis_0(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_3_axis_1(
+__kernel void fft_radix_3_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1094,12 +1151,15 @@ kernel void fft_radix_3_axis_1(
#endif /* IN_PLACE */
// Load three complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1109,16 +1169,16 @@ kernel void fft_radix_3_axis_1(
DFT_3(c0, c1, c2);
// Store three complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
}
/** Computes a stage of a radix-4 FFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1138,7 +1198,7 @@ kernel void fft_radix_3_axis_1(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_4_axis_0(
+__kernel void fft_radix_4_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1167,13 +1227,17 @@ kernel void fft_radix_4_axis_0(
#endif /* IN_PLACE */
// Load four complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1184,17 +1248,17 @@ kernel void fft_radix_4_axis_0(
DFT_4(c0, c1, c2, c3);
// Store four complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
}
/** Computes a stage of a radix-4 FFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1214,7 +1278,7 @@ kernel void fft_radix_4_axis_0(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_4_axis_1(
+__kernel void fft_radix_4_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1243,13 +1307,17 @@ kernel void fft_radix_4_axis_1(
#endif /* IN_PLACE */
// Load four complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1260,17 +1328,17 @@ kernel void fft_radix_4_axis_1(
DFT_4(c0, c1, c2, c3);
// Store four complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
}
/** Computes a stage of a radix-5 FFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1290,7 +1358,7 @@ kernel void fft_radix_4_axis_1(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_5_axis_0(
+__kernel void fft_radix_5_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1319,14 +1387,19 @@ kernel void fft_radix_5_axis_0(
#endif /* IN_PLACE */
// Load five complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
- float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1338,18 +1411,18 @@ kernel void fft_radix_5_axis_0(
DFT_5(c0, c1, c2, c3, c4);
// Store five complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
- vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
}
/** Computes a stage of a radix-5 FFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1369,7 +1442,7 @@ kernel void fft_radix_5_axis_0(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_5_axis_1(
+__kernel void fft_radix_5_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1398,14 +1471,19 @@ kernel void fft_radix_5_axis_1(
#endif /* IN_PLACE */
// Load five complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
- float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1417,18 +1495,18 @@ kernel void fft_radix_5_axis_1(
DFT_5(c0, c1, c2, c3, c4);
// Store five complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
- vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
}
/** Computes a stage of a radix-7 FFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1448,7 +1526,7 @@ kernel void fft_radix_5_axis_1(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_7_axis_0(
+__kernel void fft_radix_7_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1477,16 +1555,23 @@ kernel void fft_radix_7_axis_0(
#endif /* IN_PLACE */
// Load seven complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
- float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
- float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
- float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1500,20 +1585,20 @@ kernel void fft_radix_7_axis_0(
DFT_7(c0, c1, c2, c3, c4, c5, c6);
// Store seven complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
- vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
- vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
- vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
}
/** Computes a stage of a radix-7 FFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1533,7 +1618,7 @@ kernel void fft_radix_7_axis_0(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_7_axis_1(
+__kernel void fft_radix_7_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1562,16 +1647,23 @@ kernel void fft_radix_7_axis_1(
#endif /* IN_PLACE */
// Load seven complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
- float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
- float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
- float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1585,20 +1677,20 @@ kernel void fft_radix_7_axis_1(
DFT_7(c0, c1, c2, c3, c4, c5, c6);
// Store seven complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
- vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
- vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
- vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
}
/** Computes a stage of a radix-8 FFT on axis 0.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1618,7 +1710,7 @@ kernel void fft_radix_7_axis_1(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_8_axis_0(
+__kernel void fft_radix_8_axis_0(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1647,17 +1739,25 @@ kernel void fft_radix_8_axis_0(
#endif /* IN_PLACE */
// Load eight complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
- float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
- float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
- float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
- float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1672,21 +1772,21 @@ kernel void fft_radix_8_axis_0(
DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
// Store eight complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
- vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
- vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
- vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
- vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+ vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0));
}
/** Computes a stage of a radix-8 FFT on axis 1.
*
* @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
*
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
* @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1706,7 +1806,7 @@ kernel void fft_radix_8_axis_0(
* @param[in] Ni Nx * Ny.
* @param[in] exp_const Exponent constant
*/
-kernel void fft_radix_8_axis_1(
+__kernel void fft_radix_8_axis_1(
TENSOR3D_DECLARATION(input)
#ifndef IN_PLACE
,
@@ -1735,17 +1835,25 @@ kernel void fft_radix_8_axis_1(
#endif /* IN_PLACE */
// Load eight complex input values
- float2 c0 = vload2(0, (__global float *)input.ptr);
- float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
- float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
- float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
- float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
- float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
- float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
- float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0));
// Compute phi
- float phi = (float)nx * exp_const;
+ DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
// Multiply by twiddle factor
TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
@@ -1760,12 +1868,13 @@ kernel void fft_radix_8_axis_1(
DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
// Store eight complex output values
- vstore2(c0, 0, (__global float *)output.ptr);
- vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
- vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
- vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
- vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
- vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
- vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
- vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 0, 7 * Nx, 0));
-} \ No newline at end of file
+ vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+ vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+ vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+ vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
+#endif // defined(DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
index 200ab91f49..5f64d95bf9 100644
--- a/src/core/CL/cl_kernels/fft_digit_reverse.cl
+++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,10 @@
*/
#include "helpers.h"
-#if defined(VEC_SIZE)
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
/** Computes the digit reverse stage on axis X
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -61,33 +61,36 @@ __kernel void fft_digit_reverse_axis_0(
// Load data
#if VEC_SIZE == 1
- float data = *((__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
#elif VEC_SIZE == 2
- float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
#else // VEC_SIZE == 1
#error "vec_size of 1 and 2 are supported"
#endif // VEC_SIZE == 1
// Create result
#if VEC_SIZE == 1
- float2 res = { data, 0 };
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = { data, 0 };
#elif VEC_SIZE == 2
- float2 res = data;
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = data;
#else // VEC_SIZE == 1
#error "vec_size of 1 and 2 are supported"
#endif // VEC_SIZE == 1
// Store result
#if defined(CONJ)
- vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
#else // defined(CONJ)
- vstore2(res, 0, (__global float *)dst.ptr);
+ vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
#endif // defined(CONJ)
}
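
The iidx used above (loaded in the elided context) indexes an idx tensor that the host fills with the digit-reversed permutation; for a plan built purely from radix-2 stages that permutation is ordinary bit reversal. A hypothetical host-side helper for that special case (the in-tree generator also handles mixed radices):

    // Reverse the low log2_n bits of i - the radix-2 case of the
    // digit-reverse permutation this kernel applies via its index tensor.
    static unsigned int bit_reverse(unsigned int i, unsigned int log2_n)
    {
        unsigned int r = 0;
        for(unsigned int b = 0; b < log2_n; ++b)
        {
            r = (r << 1) | (i & 1u);
            i >>= 1;
        }
        return r;
    }
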
/** Computes the digit reverse stage on axis Y
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -122,27 +125,30 @@ __kernel void fft_digit_reverse_axis_1(
// Load data
#if VEC_SIZE == 1
- float data = *((__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
#elif VEC_SIZE == 2
- float2 data = vload2(0, (__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
#else // VEC_SIZE == 1
#error "vec_size of 1 and 2 are supported"
#endif // VEC_SIZE == 1
// Create result
#if VEC_SIZE == 1
- float2 res = { data, 0 };
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = { data, 0 };
#elif VEC_SIZE == 2
- float2 res = data;
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = data;
#else // VEC_SIZE == 1
#error "vec_size of 1 and 2 are supported"
#endif // VEC_SIZE == 1
// Store result
#if defined(CONJ)
- vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
#else // defined(CONJ)
- vstore2(res, 0, (__global float *)dst.ptr);
+ vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
#endif // defined(CONJ)
}
-#endif // defined(VEC_SIZE) \ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl
index 270fb78ae2..c799dd3b9e 100644
--- a/src/core/CL/cl_kernels/fft_scale.cl
+++ b/src/core/CL/cl_kernels/common/fft_scale.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,9 +23,10 @@
*/
#include "helpers.h"
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
/** Computes the fft scale stage
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -62,17 +63,19 @@ __kernel void fft_scale_conj(
// Store result
#if VEC_SIZE == 1
- *((__global float *)dst.ptr) = (*(__global float *)src.ptr) / scale;
+ *((__global DATA_TYPE *)dst.ptr) = (*(__global DATA_TYPE *)src.ptr) / (DATA_TYPE)scale;
#elif VEC_SIZE == 2
// Load data
- float2 data = vload2(0, (__global float *)src.ptr);
- data /= scale;
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data = vload2(0, (__global DATA_TYPE *)src.ptr);
+ data /= (DATA_TYPE)scale;
#if defined(CONJ)
- vstore2((float2)(data.s0, -data.s1), 0, (__global float *)dst.ptr);
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(data.s0, -data.s1), 0, (__global DATA_TYPE *)dst.ptr);
#else // defined(CONJ)
- vstore2(data, 0, (__global float *)dst.ptr);
+ vstore2(data, 0, (__global DATA_TYPE *)dst.ptr);
#endif // defined(CONJ)
#else // VEC_SIZE == 1
#error "vec_size of 1 and 2 are supported"
#endif // VEC_SIZE == 1
-} \ No newline at end of file
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) \ No newline at end of file
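
Conjugation and the division by scale are exactly the two fix-ups needed to express an inverse transform through the forward-FFT kernels, per the standard identity

    IFFT(x) = (1/N) * conj(FFT(conj(x)))

which is presumably why this scale stage carries the optional -DCONJ path alongside the division.
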
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl
index 5775d899e8..a43343c9f4 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/common/fill_border.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/common/floor.cl
index 1988ba4e92..f6dd4edd2e 100644
--- a/src/core/CL/cl_kernels/floor.cl
+++ b/src/core/CL/cl_kernels/common/floor.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,13 @@
*/
#include "helpers.h"
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
/** Perform a floor operation on an input tensor.
*
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, defined as: x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
* @note Can only take floating point data types.
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
@@ -50,9 +53,16 @@ __kernel void floor_layer(
TENSOR3D_DECLARATION(input),
TENSOR3D_DECLARATION(output))
{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ // Offset computation
+ const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data0 = floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr));
- VSTORE(VEC_SIZE)
- (floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file
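
A worked example of the new boundary handling, assuming an X dimension of 10 elements built with -DVEC_SIZE=4 and therefore -DVEC_SIZE_LEFTOVER=2:

    // x_offs = max(gid * 4 - (4 - 2) % 4, 0) = max(gid * 4 - 2, 0)
    // gid 0: x_offs = 0 -> STORE_VECTOR_SELECT takes the partial path, elements 0..1
    // gid 1: x_offs = 2 -> full 4-wide store, elements 2..5
    // gid 2: x_offs = 6 -> full 4-wide store, elements 6..9

Every store stays in bounds, no element is written twice, and only the first work-item per row pays the partial-store cost.
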
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/common/gather.cl
index 41f439cb47..e16f4bf315 100644
--- a/src/core/CL/cl_kernels/gather.cl
+++ b/src/core/CL/cl_kernels/common/gather.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,6 @@
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
* @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
* @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
- * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
*
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: All
@@ -59,33 +58,73 @@
*/
__kernel void gather(
TENSOR4D_DECLARATION(input),
- VECTOR_DECLARATION(indices),
+ TENSOR4D_DECLARATION(indices),
TENSOR4D_DECLARATION(output))
{
const int px = get_global_id(0);
const int py = get_global_id(1);
const int pz = get_global_id(2) % OUTPUT_DIM_Z;
- const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+ const int pw = (get_global_id(2) / OUTPUT_DIM_Z);
- const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
- const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+ const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ const Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(indices);
Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
#if AXIS == 0
- const uint index = *(__global const uint *)vector_offset(&indices, px);
- __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, py, pz, pw);
+#elif INDICES_DIMS == 2
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, py, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, pz, pw, 0);
+#elif INDICES_DIMS == 3
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, py, pz, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, pw, 0, 0);
+#elif INDICES_DIMS == 4
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, px, py, pz, pw);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, safe_index, 0, 0, 0);
+#endif //INDICES_DIMS
+
#elif AXIS == 1
- const uint index = *(__global const uint *)vector_offset(&indices, py);
- __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, py, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, pz, pw);
+#elif INDICES_DIMS == 2
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, py, pz, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, pw, 0);
+#elif INDICES_DIMS == 3
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, py, pz, pw, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, 0, 0);
+#endif //INDICES_DIMS
+
#elif AXIS == 2
- const uint index = *(__global const uint *)vector_offset(&indices, pz);
- __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, pz, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, safe_index, pw);
+#elif INDICES_DIMS == 2
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, pz, pw, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, safe_index, 0);
+#endif //INDICES_DIMS
+
#elif AXIS == 3
- const uint index = *(__global const uint *)vector_offset(&indices, pw);
- __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#if INDICES_DIMS == 1
+ const uint index = *(__global const uint *)tensor4D_offset(&indices, pw, 0, 0, 0);
+ const uint safe_index = select((uint)0, index, index < INDEX_LIMIT);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, safe_index);
+#endif //INDICES_DIMS
+
#endif //AXIS
- *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+ *(__global DATA_TYPE *)output.ptr = select((DATA_TYPE)0, *((__global const DATA_TYPE *)input_addr), (DATA_TYPE)(index < INDEX_LIMIT));
}
-#endif //defined(DATA_TYPE) && defined(AXIS) \ No newline at end of file
+#endif //defined(DATA_TYPE) && defined(AXIS)
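
The two-step clamp keeps every read in bounds and then discards out-of-range results. Since scalar OpenCL select(a, b, c) evaluates to c ? b : a, the AXIS == 0, INDICES_DIMS == 1 path is equivalent to this plain-C sketch:

    // Read from a clamped (always valid) address, then zero the result
    // if the original index was out of range.
    const uint      safe_index = (index < INDEX_LIMIT) ? index : 0u;
    const DATA_TYPE value      = *((__global const DATA_TYPE *)input_addr); // input_addr built from safe_index
    *(__global DATA_TYPE *)output.ptr = (index < INDEX_LIMIT) ? value : (DATA_TYPE)0;
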
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index b6afb85aa4..0c30c0e626 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,856 +24,7 @@
#include "gemm_helpers.h"
#include "repeat.h"
-#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
-#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
-#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
-#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
-#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
-#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
-#define CONCAT_INC(K0) INC##K0
-#define INC(K0) CONCAT_INC(K0)
-
-#if(SRC_WIDTH % K0)
-#define BOUNDARY_CONDITION_X(x, a) \
- ({ \
- a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
- })
-#else // (SRC_WIDTH % K0)
-#define BOUNDARY_CONDITION_X(x, a) \
- ({})
-#endif // (SRC_WIDTH % K0)
-
-#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({ \
- if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
- { \
- if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
- { \
- LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- else \
- { \
- LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- } \
- else \
- { \
- if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
- { \
- LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- else \
- { \
- LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
- } \
- } \
- })
-
-/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
- * the output matrix unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
- * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
- * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
- * @note Only the following values for M0, K0 and V0 are supported:
- * M0: 2,3,4,5,6,7,8
- * K0: 2,3,4,8,16
- * V0: greater than 0
- * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Block size
-#define BLOCK_SIZE ((M0) * (K0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (K0) * (V0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (K0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
- (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
-
- // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
-
- input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- input_ptr += z * (uint)src_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- output_ptr += z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- // Load values from the LHS matrix
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
-
- LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
-
- // ---------------------------Store output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
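The two pointer expressions above pack the whole layout. As a plain scalar model of the same addressing (a hypothetical sketch, assuming float data, a single batch and strides counted in elements rather than bytes):

    #include <stddef.h>

    /* Scalar model of gemm_reshape_lhs_matrix_nt: block (x, y) of size M0xK0 is
     * copied row by row, with V0 vertical blocks packed onto one destination row.
     * Boundary handling is omitted for brevity. */
    static void reshape_lhs_nt_ref(const float *src, float *dst,
                                   size_t src_stride_y, size_t dst_stride_y,
                                   int M0, int K0, int V0, int interleave,
                                   int blocks_x, int blocks_y)
    {
        const int block_size = M0 * K0;
        const int out_off_x  = interleave ? K0 : block_size; /* OUTPUT_OFFSET_X */
        const int out_step_x = interleave ? K0 * V0 : K0;    /* OUTPUT_STEP_X   */

        for(int y = 0; y < blocks_y; ++y)
            for(int x = 0; x < blocks_x; ++x)
                for(int m = 0; m < M0; ++m)
                    for(int k = 0; k < K0; ++k)
                        dst[(y / V0) * dst_stride_y + x * block_size * V0
                            + (y % V0) * out_off_x + m * out_step_x + k] =
                            src[(y * M0 + m) * src_stride_y + x * K0 + k];
    }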
-
-#if M0 == 2
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 3 // M0 == 3
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 4 // M0 == 4
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#elif M0 == 5 // M0 == 5
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- DATA_TYPE res1 = a4.s##i; \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
- })
-#elif M0 == 6 // M0 == 6
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- VSTORE(2) \
- (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
- })
-#elif M0 == 7 // M0 == 7
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
- VSTORE(4) \
- (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- VSTORE(3) \
- (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
- })
-#elif M0 == 8 // M0 == 8
-#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, M0) \
- res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
- VSTORE(M0) \
- (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
- })
-#else // M0 not supported
-#error "M0 value not supported"
-#endif // M0 conditions
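In the macros above the column index i doubles as a hexadecimal digit: .s##i pastes a vector component selector and 0x##i pastes an offset literal, which is what lets one macro serve columns 0-9 and A-F alike. As a minimal sketch, assuming M0 = 2 and DATA_TYPE = half, the i = A instantiation is equivalent to:

    half2 res = (half2)(a0.sA, a1.sA);
    vstore2(res, 0, (__global half *)(output_ptr + 0xA * output_step_x * sizeof(half)));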
-
-/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (transposed) in
- * the output matrix, unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
- * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
- * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
- * @note Only the following values for M0, K0 and V0 are supported:
- * M0: 2,3,4,5,6,7,8
- * K0: 2,3,4,8,16
- * V0: greater than 0
- * @note In case the input has to be reinterpreted as a 3D tensor (e.g. the input of a 1x1 convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
- * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Block size
-#define BLOCK_SIZE ((M0) * (K0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (M0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (M0) * (V0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (M0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
- (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
-
- // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
-
- input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
-
- // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- input_ptr += z * (uint)src_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- output_ptr += z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
-
- LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
-
- // ---------------------------Transpose and store block -----------------------
-
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
-#if K0 > 2
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
-#endif // K0 > 2
-#if K0 > 3
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
-#endif // K0 > 3
-#if K0 > 4
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
-#endif // K0 > 4
-#if K0 > 8
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
- TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
-#endif // K0 > 8
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
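For the transposed kernel above, the same hypothetical scalar model applies with the innermost store swapped: OUTPUT_OFFSET_X and OUTPUT_STEP_X are derived from M0 instead of K0, so column k of each M0xK0 block lands as one contiguous run of M0 values:

    /* Transposed store: swap the roles of m and k in the destination index.
     * Here out_off_x = interleave ? M0 : M0 * K0 and
     *      out_step_x = interleave ? M0 * V0 : M0. */
    dst[(y / V0) * dst_stride_y + x * block_size * V0
        + (y % V0) * out_off_x + k * out_step_x + m] =
        src[(y * M0 + m) * src_stride_y + x * K0 + k];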
-
-#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (not transposed) in
- * the output matrix, unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
- * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- * @note Only the following values for K0, N0 and H0 are supported:
- * N0: 2,3,4,8,16
- * K0: 1,2,3,4,8,16
- * H0: greater than 0
- *
- * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Block size
-#define BLOCK_SIZE ((K0) * (N0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (N0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (N0) * (H0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (N0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
- x / (uint)H0)
- * (uint)dst_stride_y)
- + z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
-
- REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
-
- // Load values from the RHS matrix
- a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-#if K0 > 1
- if(y * (uint)K0 + 1 < SRC_HEIGHT)
- {
- a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- }
-#endif // K0 > 1
-#if K0 > 2
- if(y * (uint)K0 + 2 < SRC_HEIGHT)
- {
- a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- }
-#endif // K0 > 2
-#if K0 > 3
- if(y * (uint)K0 + 3 < SRC_HEIGHT)
- {
- a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
- }
-#endif // K0 > 3
-#if K0 > 4
- if(y * (uint)K0 + 4 < SRC_HEIGHT)
- {
- a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
- }
- if(y * (uint)K0 + 5 < SRC_HEIGHT)
- {
- a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
- }
- if(y * (uint)K0 + 6 < SRC_HEIGHT)
- {
- a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
- }
- if(y * (uint)K0 + 7 < SRC_HEIGHT)
- {
- a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
- }
-#endif // K0 > 4
-#if K0 > 8
- if(y * (uint)K0 + 8 < SRC_HEIGHT)
- {
- a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
- }
- if(y * (uint)K0 + 9 < SRC_HEIGHT)
- {
- a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
- }
- if(y * (uint)K0 + 10 < SRC_HEIGHT)
- {
- aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
- }
- if(y * (uint)K0 + 11 < SRC_HEIGHT)
- {
- aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
- }
- if(y * (uint)K0 + 12 < SRC_HEIGHT)
- {
- aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
- }
- if(y * (uint)K0 + 13 < SRC_HEIGHT)
- {
- aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
- }
- if(y * (uint)K0 + 14 < SRC_HEIGHT)
- {
- aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
- }
- if(y * (uint)K0 + 15 < SRC_HEIGHT)
- {
- aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
- }
-#endif // K0 > 8
-
- // ---------------------------Store output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-
-#if defined(TRANSPOSE)
-/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in
- * the output matrix, unrolling the values.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
- * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
- * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
- * @note The option -DTRANSPOSE must be passed at compile time.
- * @note Only the following values for K0, N0 and H0 are supported:
- * N0: 2,3,4,8,16
- * K0: 2,3,4,8,16
- * H0: greater than 0
- *
- * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Block size
-#define BLOCK_SIZE ((K0) * (N0))
-
- // Output offset X
-#if defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE)
-#define OUTPUT_OFFSET_X (BLOCK_SIZE)
-#endif // defined(INTERLEAVE)
-
- // Output step X
-#if defined(INTERLEAVE)
-#define OUTPUT_STEP_X (K0) * (H0)
-#else // Do not interleave
-#define OUTPUT_STEP_X (K0)
-#endif // defined(INTERLEAVE)
-
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // ------------------ Compute input/output addresses ---------------------------
-
- // Compute the input address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
-
- // Compute the output address
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
- (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
-
- // ---------------------------Load input values --------------------------------
- REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
-
- // Load values from the RHS matrix
- a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- if(y * (uint)K0 + 1 < SRC_HEIGHT)
- {
- a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- }
-#if K0 > 2
- if(y * (uint)K0 + 2 < SRC_HEIGHT)
- {
- a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- }
-#endif // K0 > 2
-#if K0 > 3
- if(y * (uint)K0 + 3 < SRC_HEIGHT)
- {
- a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
- }
-#endif // K0 > 3
-#if K0 > 4
- if(y * (uint)K0 + 4 < SRC_HEIGHT)
- {
- a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
- }
- if(y * (uint)K0 + 5 < SRC_HEIGHT)
- {
- a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
- }
- if(y * (uint)K0 + 6 < SRC_HEIGHT)
- {
- a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
- }
- if(y * (uint)K0 + 7 < SRC_HEIGHT)
- {
- a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
- }
-#endif // K0 > 4
-#if K0 > 8
- if(y * (uint)K0 + 8 < SRC_HEIGHT)
- {
- a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
- }
- if(y * (uint)K0 + 9 < SRC_HEIGHT)
- {
- a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
- }
- if(y * (uint)K0 + 10 < SRC_HEIGHT)
- {
- aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
- }
- if(y * (uint)K0 + 11 < SRC_HEIGHT)
- {
- aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
- }
- if(y * (uint)K0 + 12 < SRC_HEIGHT)
- {
- aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
- }
- if(y * (uint)K0 + 13 < SRC_HEIGHT)
- {
- aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
- }
- if(y * (uint)K0 + 14 < SRC_HEIGHT)
- {
- aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
- }
- if(y * (uint)K0 + 15 < SRC_HEIGHT)
- {
- aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
- }
-#endif // K0 > 8
-
- // ---------------------------Transpose the block ------------------------------
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
-
-#if K0 == 2
- // This part computes the following transpositions:
- // 2x2 -> 2x2
- // 2x4 -> 4x2
- // 2x8 -> 8x2
- // 2x16 -> 16x2
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
-#endif // N0 > 8
-
-#elif K0 == 3 // K0 == 3
- // This part computes the following transpositions:
- // 3x2 -> 2x3
- // 3x4 -> 4x3
- // 3x8 -> 8x3
- // 3x16 -> 16x3
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
-#endif // N0 > 8
-
-#elif K0 == 4 // K0 == 4
- // This part computes the following transpositions:
- // 4x2 -> 2x4
- // 4x4 -> 4x4
- // 4x8 -> 8x4
- // 4x16 -> 16x4
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
-#endif // N0 > 8
-
-#elif K0 == 8 // K0 == 8
- // This part computes the following transpositions:
- // 8x2 -> 2x8
- // 8x4 -> 4x8
- // 8x8 -> 8x8
- // 8x16 -> 16x8
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
-#endif // N0 > 8
-
-#elif K0 == 16 // K0 == 16
-
- // This part computes the following transpositions:
- // 16x2 -> 2x16
- // 16x4 -> 4x16
- // 16x8 -> 8x16
- // 16x16 -> 16x16
- res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
- a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
- res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
- a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
-#if N0 > 2
- res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
- a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
-#endif // N0 > 2
-#if N0 > 3
- res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
- a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
-#endif // N0 > 3
-#if N0 > 4
- res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
- a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
- res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
- a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
- res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
- a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
- res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
- a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
-#endif // N0 > 4
-#if N0 > 8
- res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
- a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
- res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
- a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
- resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
- a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
- resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
- a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
- resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
- a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
- a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
- resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
- a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
- resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
- a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
-#endif // N0 > 8
-
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-
- // ---------------------------Store the output values ------------------------------
- REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
- STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
-
-#undef BLOCK_SIZE
-#undef OUTPUT_OFFSET_X
-#undef OUTPUT_STEP_X
-}
-#endif // defined(TRANSPOSE)
-#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
#define CONCAT(a, b) a##b
@@ -997,13 +148,13 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of the LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
@@ -1055,6 +206,9 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -1076,7 +230,10 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -1096,6 +253,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
uint y = get_global_id(1);
uint z = get_global_id(2);
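+ // cond_y flags the first row block, which holds the partial M0 tile once
+ // COMPUTE_M0_START_ROW has shifted the grid; cond_x flags the last, possibly
+ // partial, column block. Both feed the boundary-aware loads and the final store.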
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
#if defined(DUMMY_WORK_ITEMS)
if((x * N0 >= N) || (y * M0 >= M))
{
@@ -1250,7 +410,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -1262,7 +422,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
#else // defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -1275,28 +435,27 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
#endif // defined(BETA)
#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
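+ // Each row of c is an N0-wide vector, so N0 is the vector size the activation
+ // has to use here.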
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
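With M, N and K now trailing kernel arguments instead of -D build options, the host must set them per dispatch. A minimal hypothetical host-side sketch (the exact argument index depends on which optional tensor and stride arguments precede them):

    #include <CL/cl.h>

    /* Hypothetical helper: set the trailing M, N, K arguments introduced by this
     * patch. first_dim_arg is the index just past the last stride argument, which
     * varies with BETA and the REINTERPRET_*_AS_3D options. */
    static cl_int set_gemm_dims(cl_kernel kernel, cl_uint first_dim_arg,
                                cl_int m, cl_int n, cl_int k)
    {
        cl_int err = clSetKernelArg(kernel, first_dim_arg + 0, sizeof(cl_int), &m);
        if(err == CL_SUCCESS)
        {
            err = clSetKernelArg(kernel, first_dim_arg + 1, sizeof(cl_int), &n);
        }
        if(err == CL_SUCCESS)
        {
            err = clSetKernelArg(kernel, first_dim_arg + 2, sizeof(cl_int), &k);
        }
        return err;
    }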
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 * Since we cannot create a 3D image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
* could be different from the value returned by get_image_height(rhs_img).
@@ -1346,6 +505,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -1367,12 +529,15 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-#define LEFTOVER_K (K % K0)
+ const uint LEFTOVER_K = K % K0;
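+ // K is a runtime argument now, so the K0 tail below is a runtime branch
+ // instead of code selected at compile time by the old -DK-based macro.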
// Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
@@ -1392,6 +557,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
uint y = get_global_id(1);
uint z = get_global_id(2);
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
#if defined(DUMMY_WORK_ITEMS)
if((x * N0 >= N) || (y * M0 >= M))
{
@@ -1472,99 +640,100 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
}
-#if LEFTOVER_K != 0
- // Note: We cannot read out-of-bound elements from the RHS matrix because
- // the RHS width is always a multiple of K0. This is not necessarily true for the LHS matrix
-
- union UNION_VEC_TYPE
+ if(LEFTOVER_K != 0)
{
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
-
- union UNION_VEC_TYPE a0 = {.v = 0 };
+ // Note: We cannot read out-of-bound elements from the RHS matrix because
+ // the RHS width is always a multiple of K0. This is not necessarily true for the LHS matrix
+ // Left-over accumulations for LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
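+ // The union lets the K tail be filled one scalar at a time while the
+ // dot-product accumulation below still consumes a full K0-wide vector;
+ // untouched lanes stay zero and contribute nothing.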
+
+ union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
+ union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
+ union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
+ union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
+ union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
+ union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
+ union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
+ union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7
- lhs_offset += sizeof(DATA_TYPE);
- }
+ lhs_offset += sizeof(DATA_TYPE);
+ }
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7
-
-#endif // LEFTOVER_K != 0
+ }
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
@@ -1596,7 +765,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -1608,7 +777,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
#else // defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -1621,22 +790,19 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
#endif // defined(BETA)
#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
-#undef LEFTOVER_K
+#undef RHS_STEP_LOOP
#undef PIXEL_UNIT
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
#define VFMA(a, b, c) \
({ \
@@ -1715,12 +881,13 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
@@ -1772,6 +939,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -1793,7 +963,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -1813,6 +986,9 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
uint y = get_global_id(1);
uint z = get_global_id(2);
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
#if defined(DUMMY_WORK_ITEMS)
if((x * N0 >= N) || (y * M0 >= M))
{
@@ -1992,7 +1168,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -2004,7 +1180,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
#else // defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -2017,28 +1193,27 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
#endif // defined(BETA)
#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 * Since we cannot create a 3D image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
* could be different from the value returned by get_image_height(rhs_img).
@@ -2088,6 +1263,9 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -2109,7 +1287,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -2121,15 +1302,20 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#define RHS_STEP_LOOP 1
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
+#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
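+// RHS_STEP_LOOP complements RHS_STEP_X: the interleaved layout takes one large
+// stride per loop iteration, the non-interleaved one takes H0 small strides.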
uint x = get_global_id(0);
uint y = get_global_id(1);
uint z = get_global_id(2);
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
#if defined(DUMMY_WORK_ITEMS)
if((x * N0 >= N) || (y * M0 >= M))
{
@@ -2301,7 +1487,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -2313,7 +1499,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#else // defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -2326,23 +1512,21 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#endif // defined(BETA)
#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
#if defined(MIXED_PRECISION)
#if K0 == 2
@@ -2460,6 +1644,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#endif // K0 conditions
#endif // defined(MIXED_PRECISION)
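+// ARM_DOT_K0XN0 may already be defined by the only-RHS section above; undefine
+// it here so this section can redefine it for its own K0/N0 configuration.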
+#if defined(ARM_DOT_K0XN0)
+#undef ARM_DOT_K0XN0
+#endif // defined(ARM_DOT_K0XN0)
+
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c) \
({ \
@@ -2517,6 +1705,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 block must NOT be transposed
 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 block must be transposed
@@ -2572,12 +1761,14 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -2585,7 +1776,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -2596,7 +1786,10 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))
@@ -2652,7 +1845,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
- for(int i = 0; i < k; i += K0)
+ for(int i = 0; i < K; i += K0)
{
// Supported cases (M0, K0):
// 1,2 - 1,3 - 1,4 - 1,8 - 1,16
@@ -2701,6 +1894,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
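+    // cond_y/cond_x are hoisted here because they now guard the bias loads as well as the final
+    // store; they are true only for the blocks touching the bottom/right edges of the output.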
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
@@ -2726,7 +1922,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -2744,7 +1940,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
2) * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -2763,15 +1959,12 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
// Store output block
#if defined(MIXED_PRECISION)
CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
@@ -2789,8 +1982,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
@@ -2845,12 +2039,14 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -2858,7 +2054,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -2869,7 +2064,10 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
@@ -2971,6 +2169,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
@@ -2996,7 +2197,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -3014,7 +2215,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
2) * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -3033,15 +2234,12 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
// Store output block
#if defined(MIXED_PRECISION)
CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
@@ -3060,7 +2258,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
#if defined(LHS_TRANSPOSE)
@@ -3172,6 +2370,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
(M0, N0, TYPE, A, B, C)
+#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
@@ -3225,12 +2424,14 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -3238,7 +2439,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -3249,7 +2449,10 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))
@@ -3280,6 +2483,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
const uint y = get_global_id(1);
const uint z = get_global_id(2);
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
#if defined(DUMMY_WORK_ITEMS)
if((x * N0 >= N) || (y * M0 >= M))
{
@@ -3308,7 +2514,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
__global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
- for(int i = 0; i < k; i += K0)
+ for(int i = 0; i < K; i += K0)
{
VEC_DATA_TYPE(DATA_TYPE, M0)
a0;
@@ -3491,7 +2697,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -3509,7 +2715,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
2) * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -3527,15 +2733,12 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
// Store output block
#if defined(MIXED_PRECISION)
CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
@@ -3551,8 +2754,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
+#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
@@ -3560,7 +2764,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The GEMM's dimensions M, N and K must be passed at runtime.
* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
* could be different from the value returned by get_image_height(rhs_img).
@@ -3605,12 +2809,14 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -3618,7 +2824,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
IMAGE_DECLARATION(dst),
- uint k,
uint lhs_stride_z,
uint rhs_stride_z,
#if defined(BETA)
@@ -3629,7 +2834,10 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -3834,6 +3042,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
@@ -3859,7 +3070,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#if defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
@@ -3876,7 +3087,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#else // defined(BROADCAST_BIAS)
__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
@@ -3894,15 +3105,12 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else // defined(MIXED_PRECISION)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
// Store output block
#if defined(MIXED_PRECISION)
CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
@@ -3921,13 +3129,13 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
#endif // defined(LHS_TRANSPOSE)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
-#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
#define VFMA(a, b, c) \
({ \
@@ -4006,13 +3214,13 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_NATIVE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS matrix is NOT reshaped
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
@@ -4060,6 +3268,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
@@ -4074,7 +3285,10 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
#if defined(BETA)
uint bias_stride_z,
#endif //defined(BETA)
- uint dst_stride_z
+ uint dst_stride_z,
+ const int M,
+ const int N,
+ const int K
#if defined(REINTERPRET_INPUT_AS_3D)
,
uint lhs_cross_plane_pad
@@ -4137,6 +3351,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
int i = 0;
+#if K0 > 1
for(; i <= (K - K0); i += K0)
{
// Supported cases (M0, K0):
@@ -4182,7 +3397,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
lhs_offset += K0 * sizeof(DATA_TYPE);
rhs_offset += K0 * rhs_stride_y;
}
-
+#endif // K0 > 1
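+    // With K0 == 1 the blocked loop above would be identical to the scalar left-over loop below,
+    // so it is compiled out and all K accumulations are handled by the left-over loop.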
// Left-over accumulations
for(; i < K; ++i)
{
@@ -4280,7 +3495,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
#endif // defined(BETA)
#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
const bool cond_y = y == 0;
@@ -4288,12 +3503,9 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
// Store output block
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+#endif // defined(GEMM_MM_NATIVE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
#if defined(BETA)
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
@@ -4380,71 +3592,3 @@ __kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#endif // defined(BETA)
-
-#if defined(WIDTH_VECTOR_A)
-/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
- *
- * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
- *
- * @note The input A and matrix B must not be reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
- TENSOR3D_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
-{
- int idx = get_global_id(0) * 4;
- int idy = get_global_id(1);
-
- // Compute the address for the vector A and matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
- src_addr.s1 += idx * sizeof(float);
-
- int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
-
- float4 acc = 0.0f;
-
- for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
- {
- float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
-
- acc += b0 * (float4)a0.s0;
- acc += b1 * (float4)a0.s1;
- }
-
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
- {
- float a0 = *((__global float *)(src0_ptr + src_addr.s0));
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
-
- acc += b0 * (float4)a0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
-}
-#endif // defined(WIDTH_VECTOR_A)
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000000..09b8956b68
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
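+// A minimal sketch of the build options this kernel expects (illustrative values only; the
+// exact set is assembled by the host-side kernel configuration):
+//   -DGEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=1
+//   -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4 -DM0_LEFTOVER=0 -DN0_LEFTOVER=0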
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#if defined(BETA)
+ TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+ uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+ uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+ uint z = get_global_id(2); // Batch
+
+ // Get block ID and thread ID within the block
+ uint block_id = (x0 / MMUL_BLOCK_SIZE);
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+ // Coordinate within a block
+ uint block_x = thread_id % MMUL_N0;
+ uint block_y = (thread_id / MMUL_M0);
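+    // Illustrative decomposition (assuming MMUL_N0 = MMUL_M0 = 4 and MMUL_K0 = 4, so
+    // MMUL_BLOCK_SIZE = 16): the work-item with x0 = 21 gets block_id = 1 and thread_id = 5,
+    // which maps to block_x = 5 % 4 = 1 and block_y = 5 / 4 = 1 inside the cooperative block.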
+
+ // Starting destination coordinates
+ uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+ uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    // Note: dst_x and dst_y are clamped because a complete MMUL block must always be executed.
+    // The matrix multiplication is cooperative, so a full block is needed for correct results;
+    // out-of-bound work-items may only exit the kernel after the matrix multiplication part.
+
+ // Starting LHS coordinates
+ uint lhs_x = block_x;
+ uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+ uint rhs_y = block_id;
+
+ // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef BATCHED_RHS
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+#else // BATCHED_RHS
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y;
+#endif // BATCHED_RHS
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
+
+    // Note: if the RHS derives from the weights of a 2D convolution layer, the RHS is always 2D
+    // and rhs_stride_z is always 0, so the tensor does not slide along z
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, 0, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * MMUL_N0 * N0 * sizeof(DATA_TYPE);
+ }
+
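+    // Only now, after the cooperative arm_matrix_multiply() calls have completed, may the
+    // out-of-bound work-items exit: every work-item of the MMUL block had to join the loop above.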
+ if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+ {
+ return;
+ }
+
+ if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#endif // defined(HALF_PRECISION)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+ TILE(DATA_TYPE, 1, N0, bias0);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+ TILE(DATA_TYPE, M0, N0, bias0);
+
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ // Store
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension and the OpenCL image for RHS:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
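+// Note: compared to the buffer variant above, this kernel tracks the RHS position as (rhs_x, rhs_y)
+// texel coordinates and lets T_LOAD sample the OpenCL image, so no RHS byte-offset arithmetic is
+// required inside the K loop.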
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, IMAGE),
+#if defined(BETA)
+ TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+ uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+ uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+ uint z = get_global_id(2); // Batch
+
+ // Get block ID and thread ID within the block
+ uint block_id = (x0 / MMUL_BLOCK_SIZE);
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+ // Coordinate within a block
+ uint block_x = thread_id % MMUL_N0;
+ uint block_y = (thread_id / MMUL_M0);
+
+ // Starting destination coordinates
+ uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+ uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    // Note: dst_x and dst_y are clamped because a complete MMUL block must always be executed.
+    // The matrix multiplication is cooperative, so a full block is needed for correct results;
+    // out-of-bound work-items may only exit the kernel after the matrix multiplication part.
+
+ // Starting LHS coordinates
+ uint lhs_x = block_x;
+ uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+
+#ifdef BATCHED_RHS
+ uint rhs_y = block_id + z * rhs_h;
+#else // BATCHED_RHS
+ uint rhs_y = block_id;
+#endif // BATCHED_RHS
+
+ // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
+
+ // Initialize the accumulators
+    // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, IMAGE, rhs, rhs_x, rhs_y, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_x += MMUL_K0 * MMUL_N0 * N0;
+ }
+
+ if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+ {
+ return;
+ }
+
+ if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for the half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#endif // defined(HALF_PRECISION)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+ TILE(DATA_TYPE, 1, N0, bias0);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+ TILE(DATA_TYPE, M0, N0, bias0);
+
+ bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+ }
+ })
+ }
+
+#ifndef UNIT_BETA
+ T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ // Store
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
diff --git a/src/core/CL/cl_kernels/common/gemm_utils.cl b/src/core/CL/cl_kernels/common/gemm_utils.cl
new file mode 100644
index 0000000000..be57d94ce6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_utils.cl
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(RESHAPE_LHS_NT)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  M                                  The size of the height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0.
+ */
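+// Illustrative dispatch (assumed values): for a 16x16 source with M0 = 4 and K0 = 4, the kernel
+// runs on a ceil(16 / K0) x ceil(16 / M0) = 4x4 grid per batch, each work-item reshaping one
+// M0xK0 block of the LHS.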
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int V0)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+ const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+ const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+ const int xi = x * K0;
+ const int yi = y * M0;
+
+ const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
+ const int yo = (y / V0);
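+    // Illustrative offsets (assuming M0 = 4, K0 = 4, V0 = 2 and no -DINTERLEAVE, so
+    // BLOCK_SIZE = OUTPUT_OFFSET_X = 16): the block at (x, y) = (1, 3) is stored starting at
+    // xo = 1 * 16 * 2 + (3 % 2) * 16 = 48 and yo = 3 / 2 = 1.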
+
+ // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, M0, K0, in);
+
+ // Initialize the input tile to zero
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in[_i].v = 0;
+ });
+
+ bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+ bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+ // Load input tile
+ TILE(uint, M0, 1, in_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in_indirect_y[_i].v = _i;
+ });
+#if PARTIAL_M0 != 0
+ if(y_cond)
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ else
+#endif // PARTIAL_M0 != 0
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+
+ // Store output tile
+ TILE(uint, M0, 1, dst_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ dst_indirect_y[_i].v = _i;
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_NT)
+
+#if defined(RESHAPE_LHS_T)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M The size of the height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0 The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int V0)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X ((M0) * (V0))
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+ const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+ const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+ const int xi = x * K0;
+ const int yi = y * M0;
+
+ const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
+ const int yo = (y / V0);
+
+ // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, M0, K0, in);
+ TILE(DATA_TYPE, K0, M0, in_tr);
+
+ // Initialize the tile to zero
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in[_i].v = 0;
+ });
+
+ // Load input tile
+ bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+ bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+
+ TILE(uint, M0, 1, in_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ in_indirect_y[_i].v = _i;
+ });
+#if PARTIAL_M0 != 0
+ if(y_cond)
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ else
+#endif // PARTIAL_M0 != 0
+ {
+ T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+ }
+ // Transpose input tile
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, k0, 0, 1, K0,
+ {
+ in_tr[k0].s[m0] = in[m0].s[k0];
+ })
+ });
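+ // Note (illustrative): the loop above maps element (m0, k0) of the loaded tile to
+ // element (k0, m0) of in_tr, so each of the K0 rows stored below holds the M0 values
+ // of one input column.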
+
+ TILE(uint, K0, 1, dst_indirect_y);
+ LOOP_UNROLLING(int, _i, 0, 1, K0,
+ {
+ dst_indirect_y[_i].v = _i;
+ });
+
+ // Store output tile
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_T)
+
+#if defined(RESHAPE_RHS_NT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (not transposed) in
+ * the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int H0)
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X ((N0) * (H0))
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0);
+ const int y = GET_SPATIAL_IDX(1, 1, 0);
+ const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+ const int xi = x * N0;
+ const int yi = y * K0;
+
+ const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+ const int yo = (x / H0);
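+ // Illustrative mapping (example values assumed): with N0=4, K0=4, H0=2 and
+ // -DINTERLEAVE, BLOCK_SIZE=16 and OUTPUT_OFFSET_X=4, so the block at (x=3, y=1)
+ // is written from xo = 1*16*2 + (3%2)*4 = 36, yo = 3/2 = 1. Compared with the LHS
+ // reshape, the roles of x and y are swapped: H0 horizontally adjacent source blocks
+ // share the same output row.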
+
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, K0, N0, in);
+
+ // Initialize the tile to zero
+ for(int i = 0; i < K0; ++i)
+ {
+ in[i].v = 0;
+ }
+
+ // Load input tile
+ for(int i = 0; i < K0; ++i)
+ {
+ if(yi + i < src_h)
+ {
+ in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+ }
+ }
+
+ TILE(uint, K0, 1, dst_indirect_y);
+ for(int i = 0; i < K0; ++i)
+ {
+ dst_indirect_y[i].v = i;
+ }
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_RHS_NT)
+
+#if defined(RESHAPE_RHS_T)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in
+ * the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
+ TENSOR3D_T(dst, BUFFER),
+ const int H0)
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X ((K0) * (H0))
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ const int x = GET_SPATIAL_IDX(0, 1, 0);
+ const int y = GET_SPATIAL_IDX(1, 1, 0);
+ const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+ const int xi = x * N0;
+ const int yi = y * K0;
+
+ const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+ const int yo = (x / H0);
+
+ src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+ dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+ TILE(DATA_TYPE, K0, N0, in);
+ TILE(DATA_TYPE, N0, K0, in_tr);
+
+ // Initialize the tile to zero
+ for(int i = 0; i < K0; ++i)
+ {
+ in[i].v = 0;
+ }
+
+ // Load input tile
+ for(int i = 0; i < K0; ++i)
+ {
+ if(yi + i < src_h)
+ {
+ in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+ }
+ }
+
+ // Transpose input tile
+ for(int k0 = 0; k0 < K0; ++k0)
+ {
+ for(int n0 = 0; n0 < N0; ++n0)
+ {
+ in_tr[n0].s[k0] = in[k0].s[n0];
+ }
+ }
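+ // Note (illustrative): after this loop each of the N0 rows of in_tr holds the K0
+ // values of one source column, so the store below writes N0 rows of K0 elements each.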
+
+ TILE(uint, N0, 1, dst_indirect_y);
+ for(int i = 0; i < N0; ++i)
+ {
+ dst_indirect_y[i].v = i;
+ }
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#endif // defined(RESHAPE_RHS_T)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index 50dda7ef3c..62c4cd31f5 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "gemm_helpers.h"
#include "helpers_asymm.h"
#include "repeat.h"
+#include "tile_helpers.h"
#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
@@ -37,7 +38,6 @@
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. These macros use the dot8 instruction */
#define ARM_DOT1(a, b, c) \
({ \
ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
@@ -290,7 +290,7 @@
(VECTOR_ACC_TYPE, k0, a, b, c); \
})
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
@@ -393,10 +393,10 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#endif // defined(DUMMY_WORK_ITEMS)
// Compute LHS matrix address
- __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+ __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
// Compute RHS matrix address
- __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+ __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
#if defined(MATRIX_B_DEPTH)
// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
@@ -461,194 +461,13 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#endif // defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+#define FUSED_OUTPUT_STAGE_FIXED_POINT
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
- * The LHS matrix is NOT reshaped
- * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- * - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
- * - H0 >= 1
- *
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
-
- // Partial matrix multiplication M0,N0,K0
- ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
-
- lhs_offset += K0;
- rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
-
- ARM_MM_K0XN0XM0(M0, N0, 1, a, b, c);
- lhs_offset += 1;
- rhs_offset += 1;
- }
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Convert and store output block
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // Store output block
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-
-#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic.
* The LHS matrix is NOT reshaped
* The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
@@ -728,164 +547,162 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
* @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
*/
-__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
- IMAGE_DECLARATION(dst),
- uint lhs_stride_z,
- uint rhs_stride_z,
- uint dst_stride_z
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
+__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
+#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
+__kernel void gemmlowp_mm_reshaped_only_rhs_t
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
+ ,
+ uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
+ ,
+ uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
#if defined(A_OFFSET)
- ,
- IMAGE_DECLARATION(sum_col)
+ ,
+ IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
- ,
- IMAGE_DECLARATION(sum_row)
+ ,
+ IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
- ,
- VECTOR_DECLARATION(biases)
+ ,
+ VECTOR_DECLARATION(biases)
#endif // defined(ADD_BIAS)
#if defined(PER_CHANNEL_QUANTIZATION)
- ,
- VECTOR_DECLARATION(result_multipliers),
- VECTOR_DECLARATION(result_shifts)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
#endif // defined(PER_CHANNEL_QUANTIZATION)
- )
+)
{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
+ // @note: replace with (DIMENSION + PAD) once we pass the relevant info at compile time
+#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
+#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)
// RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
+#define RHS_STEP_X (K0 * H0)
#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_OFFSET_X (K0 * N0)
#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
+#define RHS_STEP_LOOP (N0 * K0 * H0)
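+// Note: one K0 iteration advances by the same number of elements in both layouts,
+// N0 * (K0 * H0) * 1 when interleaved and N0 * K0 * H0 otherwise, so a single
+// RHS_STEP_LOOP constant covers both cases.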
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
+ uint x = GET_SPATIAL_IDX(0, 1, 1);
+ uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ uint z = GET_SPATIAL_IDX(2, 1, 1);
+ int xo = (x * N0);
#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
+ if((xo >= N) || (y >= M))
{
return;
}
#endif // defined(DUMMY_WORK_ITEMS)
// Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+ uint lhs_y = y + z * FULL_LHS_HEIGHT;
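+ // The batch index z is folded into the row coordinate: each batch contributes
+ // FULL_LHS_HEIGHT (= lhs_stride_z / lhs_stride_y) rows, so lhs_y addresses row y of batch z.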
// Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+ uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
+ uint rhs_offset_y = (x / H0) * rhs_stride_y;
#if defined(MATRIX_B_DEPTH)
// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+ rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
+ rhs_offset_y += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
// Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
int i = 0;
for(; i <= (K - K0); i += K0)
{
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
// Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+ // Load values from RHS matrix
+ LOOP_UNROLLING(int, _i, 0, 1, N0,
+ {
+ b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
+ })
// Partial matrix multiplication M0,N0,K0
- ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
- lhs_offset += K0;
- rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ rhs_offset_x += RHS_STEP_LOOP;
}
+
+#if((K % K0) != 0)
+
// Left-over accumulations
for(; i < K; ++i)
{
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
- ARM_MM_K0XN0XM0(M0, N0, 1, a, b, c);
- lhs_offset += 1;
- rhs_offset += 1;
- }
- // Result of MM is of type DATA_TYPE
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+ // Load values from LHS matrix
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+ LOOP_UNROLLING(int, _i, 0, 1, N0,
+ {
+ b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
+ })
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
+ rhs_offset_x += 1;
+ }
+#endif // ((K % K0) != 0)
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
- // Convert result of matrix multiplication to S32
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int);
+ TILE(int, M0, N0, c_int);
+ TILE(int, M0, N0, offset_s32);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+ })
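+ // Offset contribution (gemmlowp scheme): c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET,
+ // where K_OFFSET is the compile-time constant A_OFFSET * B_OFFSET * K. offset_s32 starts
+ // from K_OFFSET and the A/B terms are added below when enabled.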
- // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET;
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
+ })
#if defined(A_OFFSET)
- // Compute the offset contribution due to A_OFFSET
- __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
#if defined(SUM_COL_HAS_BATCHES)
- sum_col_addr += z * sum_col_stride_y;
+ int sum_col_y = z;
+#else // defined(SUM_COL_HAS_BATCHES)
+ int sum_col_y = 0;
#endif // defined(SUM_COL_HAS_BATCHES)
- VEC_DATA_TYPE(int, N0)
- a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr);
- a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET;
+ TILE(int, 1, N0, a_offset_s32);
- REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32);
+ T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);
+
+ a_offset_s32[0].v *= A_OFFSET;
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
@@ -893,71 +710,96 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG
// Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which
// does not introduce paddings. For this reason is safe to access the tensor in this manner
// without considering that the coordinate "y" could come from an input 3D tensor
- __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + (COMPUTE_M0_START_ROW(y, (uint)M0, PARTIAL_STORE_M0)) * sizeof(int) + z * sum_row_stride_y;
+ TILE(int, M0, N0, b_offset_s32);
- LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x);
+ T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ offset_s32[i].v += b_offset_s32[i].v * B_OFFSET;
+ })
- REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET);
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
- // Add bias
- __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
- VEC_DATA_TYPE(int, N0)
- bias_values = VLOAD(N0)(0, (__global int *)bias_addr);
- REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values);
+ TILE(int, 1, N0, bias);
+
+ T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
#endif // defined(ADD_BIAS)
- REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_int[i].v += offset_s32[i].v;
+ })
+
+ TILE(DATA_TYPE, M0, N0, c_lp);
// Multiply by result_mult_int and shift
#if defined(PER_CHANNEL_QUANTIZATION)
- __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
- __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+ TILE(int, 1, N0, res_mul);
+ TILE(int, 1, N0, res_shift);
- VEC_DATA_TYPE(int, N0)
- res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr);
- VEC_DATA_TYPE(int, N0)
- res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr);
-
- REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-
-#if RESULT_SHIFT < 0
- REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, RESULT_SHIFT);
-#else // RESULT_SHIFT >= 0
- REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, RESULT_SHIFT);
-#endif // RESULT_SHIFT < 0
+ T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
+ T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);
+ T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+ T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
#endif // defined(PER_CHANNEL_QUANTIZATION)
- // Add the offset terms to GEMM's result
- REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET);
-
#if defined(MIN_BOUND)
- REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
+ })
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
+ })
#endif // defined(MAX_BOUND)
- // Convert and store output block
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
+#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+ TILE(int, M0, N0, c_lp);
- // Store output block
- REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c_int, c_lp);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
+ })
+#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
+ dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
+ dst_indirect_y[i].v += z * FULL_DST_HEIGHT * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z * FULL_DST_HEIGHT;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+ })
+
+ const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
+#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+ T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
+#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
-#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
-#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#if defined(GEMMLOWP_MM_NATIVE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
@@ -1140,9 +982,9 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#endif // defined(GEMMLOWP_MM_NATIVE)
-#if defined(COLS_A)
+#if defined(GEMMLOWP_MATRIX_A_REDUCTION)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
* It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
*
@@ -1206,8 +1048,9 @@ __kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
#endif // defined(SCALAR)
*((__global int *)dst.ptr) = (int)sum_row;
}
+#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction.
* It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
*
@@ -1253,17 +1096,17 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
VEC_DATA_TYPE(DATA_TYPE, 16)
a0 = vload16(0, matrix_a + i);
- sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
a0 = vload16(1, matrix_a + i);
- sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
- sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+ DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
}
// This for loop performs the leftover accumulations
@@ -1277,10 +1120,9 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
#endif // defined(SCALAR)
*((__global int *)dst.ptr) = (int)sum_row;
}
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#endif // defined(COLS_A)
+#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
-#if defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+#if defined(GEMMLOWP_MATRIX_B_REDUCTION)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
* It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time.
*
@@ -1360,7 +1202,7 @@ __kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif // defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+#endif // defined(GEMMLOWP_MATRIX_B_REDUCTION)
#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
@@ -1464,6 +1306,7 @@ inline VEC_INT offset_contribution(
return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
}
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION)
/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place
*
* This kernel takes a final int32 accumulator value (the output of matrix multiplication),
@@ -1567,8 +1410,9 @@ __kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
// Store the result with the offset contribution
STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION)
-#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE)
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
*
* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
@@ -1744,7 +1588,9 @@ __kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm
// Store the result
STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes down to uint8.
*
* This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
@@ -1925,13 +1771,13 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC
// Store the result
STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE)
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
#undef VEC_INT
#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
-#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
* This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
@@ -2027,9 +1873,9 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
// Store the result
STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
-#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
* This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
@@ -2124,10 +1970,9 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
// Store the result
STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
-
-#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
*
* This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
@@ -2216,9 +2061,9 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
// Store the result
STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
-#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
* This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
@@ -2293,12 +2138,13 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
VEC_DATA_TYPE(int, VEC_SIZE)
biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
- input_values += (int4)biases_values;
+ input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values;
#endif // defined(ADD_BIAS)
// Convert to float
- float4 input_values_f = convert_float4(input_values);
- input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
@@ -2313,4 +2159,4 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
// Store the result
STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
diff --git a/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000000..72fe3d3b89
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=1, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=1)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g. -DMMUL_N0=4)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g. -DMMUL_M0=4)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g. -DMMUL_K0=16)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 4
+ * - N0 = 1, 4, 8
+ * - K0 = 4
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr or S32
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_mmul(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#if defined(ADD_BIAS)
+ TENSOR3D_T(bia, BUFFER),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K
+#if defined(A_OFFSET)
+ ,
+ TENSOR3D_T(sum_col, BUFFER)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ TENSOR3D_T(sum_row, BUFFER)
+#endif // defined(B_OFFSET)
+)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_M0)
+#define VEC_SIZE 4 // For int8 types, the input to the mmul instruction is a length-4 vector
+
+ uint x0 = get_global_id(0);
+ uint y0 = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // Get block ID and thread ID within the block
+ uint block_id = (x0 / MMUL_BLOCK_SIZE);
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+ // Coordinate within a block
+ uint block_x = thread_id % MMUL_N0;
+ uint block_y = (thread_id / MMUL_M0);
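+ // Illustrative mapping (assuming MMUL_M0 = MMUL_N0 = 4): MMUL_BLOCK_SIZE = 16, so the
+ // work-item with thread_id = 13 computes at block_x = 13 % 4 = 1, block_y = 13 / 4 = 3.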
+
+ // Starting destination coordinates
+ uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+ uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+ uint lhs_x = VEC_SIZE * block_x;
+ uint lhs_y = dst_y;
+
+ uint rhs_x = VEC_SIZE * N0 * block_y;
+ uint rhs_y = 4 * block_id + block_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(OUT_DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, M0, VEC_SIZE, a);
+ T_LOAD(DATA_TYPE, M0, VEC_SIZE, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ TILE(DATA_TYPE, N0, VEC_SIZE, b);
+ T_LOAD(DATA_TYPE, N0, VEC_SIZE, BUFFER, rhs, 0, 0, 1, VEC_SIZE, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_TYPE vec_a = (VEC_TYPE)(a[m0].s[0], a[m0].s[1], a[m0].s[2], a[m0].s[3]);
+ VEC_TYPE vec_b = (VEC_TYPE)(b[n0].s[0], b[n0].s[1], b[n0].s[2], b[n0].s[3]);
+ c[m0].s[n0] = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * N0 * sizeof(DATA_TYPE);
+ }
+
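+ // The out-of-bounds checks are performed only now, after the accumulation
+ // loop: arm_matrix_multiply is a cooperative operation, so every work-item in
+ // the MMUL block is expected to execute the loop above; dst_x/dst_y are
+ // clamped with min() so that such work-items still read valid memory.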
+ if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+ {
+ return;
+ }
+
+ if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+ {
+ return;
+ }
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+ TILE(int, M0, N0, offset_s32);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+ })
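+
+ // gemmlowp offset correction: for offsets A_OFFSET and B_OFFSET, the final S32
+ // value of each element is
+ //   acc[m][n] + A_OFFSET * sum_col[n] + B_OFFSET * sum_row[m] + K_OFFSET,
+ // where K_OFFSET is expected to hold the constant K * A_OFFSET * B_OFFSET term.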
+
+#if defined(A_OFFSET)
+
+ TILE(int, 1, N0, a_offset_s32);
+
+ T_LOAD(int, 1, N0, BUFFER, sum_col, dst_x, z, 1, sum_col_stride_z, a_offset_s32);
+
+ a_offset_s32[0].v *= A_OFFSET;
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+
+ TILE(int, M0, 1, b_offset_s32);
+
+ T_LOAD(int, M0, 1, BUFFER, sum_row, dst_y, z * M, 1, 4, b_offset_s32);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ offset_s32[m0].v += b_offset_s32[m0].v * B_OFFSET;
+ })
+
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+#if defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE) + z * bia_stride_y;
+
+ TILE(int, M0, N0, bias);
+
+ T_LOAD(int, M0, N0, BUFFER, bia, dst_x, dst_y, 1, 1, bias);
+
+ T_ADD(ACC_DATA_TYPE, M0, N0, offset_s32, bias, offset_s32);
+
+#else // defined(BROADCAST_BIAS)
+ bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE);
+
+ TILE(int, 1, N0, bias);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ bias[0].v = VLOAD(N0)(0, (ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, N0_LEFTOVER)
+ (bias[0].v, 0, (ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+ }
+
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(ADD_BIAS)
+
+ T_ADD(ACC_DATA_TYPE, M0, N0, c, offset_s32, c);
+ TILE(OUT_DATA_TYPE, M0, N0, c_lp);
+ T_QUANTIZE8(ACC_DATA_TYPE, OUT_DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c, 0, 0, c_lp);
+
+#if defined(MIN_BOUND)
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MIN_BOUND);
+ })
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MAX_BOUND);
+ })
+#endif // defined(MAX_BOUND)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#else // FUSED_OUTPUT_STAGE_FIXED_POINT
+ // Store
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+#endif // FUSED_OUTPUT_STAGE_FIXED_POINT
+}
+
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
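For context, a minimal host-side sketch of how a kernel from this family might be configured at compile time; the option names mirror the macros used above, but the concrete values and the exact option set the library assembles may differ, and `program`/`device` are assumed to already exist:

    /* Hypothetical illustration only: the values below are examples, not the
     * set generated by the library. Selects the non-fused path (S32 output). */
    const char *build_opts =
        "-DGEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL "
        "-DDATA_TYPE=char -DACC_DATA_TYPE=int -DOUT_DATA_TYPE=int -DVEC_TYPE=char4 "
        "-DM0=2 -DN0=4 -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=16 "
        "-DM0_LEFTOVER=0 -DN0_LEFTOVER=0";
    cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);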
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl
index aaa83975f8..71a372eb29 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/common/gemv.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
index e8306c55a8..bfe1922ac2 100644
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,18 +59,16 @@ __kernel void generate_proposals_compute_all_anchors(
Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
- const size_t idx = get_global_id(0);
+ const unsigned int idx = get_global_id(0);
// Find the index of the anchor
- const size_t anchor_idx = idx % NUM_ANCHORS;
+ const unsigned int anchor_idx = idx % NUM_ANCHORS;
// Find which shift is this thread using
- const size_t shift_idx = idx / NUM_ANCHORS;
+ const unsigned int shift_idx = idx / NUM_ANCHORS;
// Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
- const DATA_TYPE
- shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
- const DATA_TYPE
- shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
+ const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+ const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
index 04264197f4..70f861c4b7 100644
--- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl
index 480d9cd20c..f9b3cd3620 100644
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ b/src/core/CL/cl_kernels/common/instance_normalization.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,14 +23,11 @@
*/
#include "helpers.h"
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
-/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function computes the mean and variance of each plane of the input tensor and provides them as output.
*
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
* @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
- * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
- * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
- * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
* @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
*
* @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
@@ -40,6 +37,8 @@
* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
@@ -50,38 +49,37 @@
* @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
*/
-__kernel void instance_normalization(
- TENSOR4D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR4D_DECLARATION(output)
-#endif /* IN_PLACE */
-)
+__kernel void compute_mean_var(
+ TENSOR4D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-#ifndef IN_PLACE
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-#endif /* IN_PLACE */
-
- INTERNAL_DATA_TYPE sum = 0.f;
- INTERNAL_DATA_TYPE sum_sq = 0.f;
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
#if defined(NHWC)
-
- const int ch = get_global_id(0); // Current channel
- const int batch = get_global_id(2); // Current batch
- const int elements_plane = DIM_Y * DIM_Z;
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(1); // Current batch
+ const int elements_plane = DIM_Y * DIM_Z;
+ INTERNAL_DATA_TYPE part_sum = 0.f;
+ INTERNAL_DATA_TYPE part_sum_sq = 0.f;
+ const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
for(int i_w = 0; i_w < DIM_Y; ++i_w)
{
for(int i_h = 0; i_h < DIM_Z; ++i_h)
{
INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
- sum += data;
- sum_sq += data * data;
+ part_sum += data;
+ part_sum_sq += data * data;
}
}
+ INTERNAL_DATA_TYPE mean = (part_sum / elements_plane);
+ INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean);
+ __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+ *output_address0 = mean;
+ __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+ *output_address1 = var;
#else // !defined(NHWC)
const int ch = get_global_id(2) % DIM_Z; // Current channel
const int batch = get_global_id(2) / DIM_Z; // Current batch
@@ -127,16 +125,83 @@ __kernel void instance_normalization(
part_sum.s0 += part_sum.s1;
part_sum_sq.s0 += part_sum_sq.s1;
- sum = (INTERNAL_DATA_TYPE)part_sum.s0;
- sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
+ INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0;
+ INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
+
+ const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
+ const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean);
+
+ __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+ *output_address0 = mean;
+ __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+ *output_address1 = var;
#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
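+// Note: compute_mean_var above obtains the variance in a single pass over each
+// plane via the identity Var[x] = E[x^2] - (E[x])^2, accumulating the sum and
+// the sum of squares together.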
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
+ * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void instance_normalization(
+ TENSOR4D_DECLARATION(input),
+ TENSOR3D_DECLARATION(mean_var)
+#ifndef IN_PLACE
+ ,
+ TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
+#ifndef IN_PLACE
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+#endif /* IN_PLACE */
+
+#if defined(NHWC)
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(2); // Current batch
+#else /* defined(NHWC) */
+ const int ch = get_global_id(2) % DIM_Z; // Current channel
+ const int batch = get_global_id(2) / DIM_Z; // Current batch
+#endif /* defined(NHWC) */
- const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
- const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean);
- const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
+ const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
+ const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
+ const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr;
+ const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr;
+ const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
+ const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA;
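+
+ // Each element below is normalized as:
+ //   out = (in - mean) * GAMMA / sqrt(var + EPSILON) + BETA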
#if defined(NHWC)
+ const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+#ifndef IN_PLACE
+ const int out_offset = output_offset_first_element_in_bytes + batch * output_stride_w + ch * sizeof(DATA_TYPE);
+#endif /* IN_PLACE */
for(int i_w = 0; i_w < DIM_Y; ++i_w)
{
@@ -151,7 +216,6 @@ __kernel void instance_normalization(
*(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
}
}
-
#else // !defined(NHWC)
for(int y = 0; y < DIM_Y; ++y)
{
diff --git a/src/core/CL/cl_kernels/common/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl
new file mode 100644
index 0000000000..fbe3406239
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/l2_normalize.cl
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X)
+/** This kernel performs l2 normalization on x-axis
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER_X, i.e. x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize_x(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(output),
+ DATA_TYPE epsilon)
+{
+ // Offset computation
+ const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
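+ // When the row length is not a multiple of VEC_SIZE_X, work-item 0 handles the
+ // leftover and every later work-item is shifted back by
+ // (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X. E.g. for a row of 10
+ // elements with VEC_SIZE_X = 4 (leftover 2): item 0 stores elements [0,1],
+ // item 1 stores [2..5] and item 2 stores [6..9].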
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+ __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + get_global_id(1) * sum_stride_y;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(*((__global DATA_TYPE *)sum_addr), epsilon));
+
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ data0 = in * normalize_value;
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
+}
+
+/** This kernel performs l2 normalization on y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER_X, i.e. x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize_y(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(output),
+ DATA_TYPE epsilon)
+{
+ // Offset computation
+ const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+ __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(sums, epsilon));
+
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ data0 = in * normalize_value;
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
+}
+
+/** This kernel performs l2 normalization on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER_X, i.e. x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize_z(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(sum),
+ TENSOR3D_DECLARATION(output),
+ DATA_TYPE epsilon)
+{
+ // Offset computation
+ const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * sum_stride_y;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ data0 = in * ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))(rsqrt(fmax(sums, epsilon))));
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
new file mode 100644
index 0000000000..c7ef8ae52b
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for float/half data types when bias is enabled.
+// Note: the tile's dimensions (M0 and N0) must be passed at compile time using -DM0 and -DN0 (e.g. -DM0=4, -DN0=8).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+ TILE(DATA_TYPE, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for the min and max bounds of bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
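+
+ // GET_SPATIAL_IDX applies the usual leftover mapping: when N (or M) is not a
+ // multiple of N0 (or M0), the work-item with global id 0 along that axis takes
+ // the partial block and the later work-items are shifted back, so every access
+ // stays in bounds (hence the x_cond/y_cond checks on get_global_id() == 0 below).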
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, K0, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
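+ // For the work-item that holds the bottom leftover rows (y_cond), row indices
+ // past PARTIAL_STORE_M0 - 1 are clamped to the last valid row, so the indirect
+ // store below rewrites in-bounds data rather than writing out of range.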
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for the min and max bounds of bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // This part is written to reduce the number of loop unrollings caused by
+ // T_MMUL. The NT/NT version is partly vectorized and needs fewer loop
+ // unrollings, so the code behaves as expected. Although this is not a
+ // performant solution for the specified architecture, it is necessary to
+ // overcome some limitations.
+ TILE(DATA_TYPE, K0, N0, bt);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ bt[j].s[i] = b[i].s[j];
+ })
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // See the main loop for the explanation of this part
+ TILE(DATA_TYPE, 1, N0, bt);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ bt[0].s[i] = b[i].s[0];
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_NT_T)
+
+#if defined(MAT_MUL_NATIVE_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for the min and max bounds of bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * - K0 > 0
+ * @note Values > 8 for M0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
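+ // The LHS is stored transposed (K along Y), so the M coordinate advances along
+ // X with an element-sized stride, while the K loop below steps through rows via
+ // lhs_stride_y.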
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, K0, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, K0, at);
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[i] = a[i].s[j];
+ })
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, 1, M0, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, 1, at);
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[0] = a[0].s[j];
+ })
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_T_NT)
+
+#if defined(MAT_MUL_NATIVE_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used for the min and max bounds of bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(DATA_TYPE, M0, N0, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = 0.f;
+ })
+
+ const int rhs_z = z * rhs_h;
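+ // The batch offset of the rhs tensor is folded into its row coordinate (z * rhs_h)
+ // rather than into the byte offset: when RHS_TENSOR_TYPE=IMAGE the rhs is addressed
+ // by (x, y) coordinates only, so the z dimension has to be expressed as a row shift.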
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, K0, at);
+ TILE(DATA_TYPE, K0, N0, bt);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[i] = a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ bt[j].s[i] = b[i].s[j];
+ })
+ })
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ }
+
+#if K % K0 != 0
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, 1, M0, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ a[i].v = 0.f;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.f;
+ })
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ // For explanation, see mat_mul_native_nt_t
+ TILE(DATA_TYPE, M0, 1, at);
+ TILE(DATA_TYPE, 1, N0, bt);
+
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ at[j].s[0] = a[0].s[j];
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ bt[0].s[i] = b[i].s[0];
+ })
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ }
+#endif // K % K0 != 0
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
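+ // Worked example (illustrative): with M0 = 4, PARTIAL_STORE_M0 = 2 and y_cond true,
+ // the buffer becomes min(_i, 1) = {0, 1, 1, 1}: row indices past the partial bound
+ // are clamped to the last valid row, so the indirect store below never addresses a
+ // row outside the tensor.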
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
new file mode 100644
index 0000000000..e549da86d4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
@@ -0,0 +1,946 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for float and half data types when bias is enabled.
+// Note: The tile dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+ TILE(DATA_TYPE, 1, N0, bias_tile);
+
+ // The T_LOAD below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
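+
+// Example (illustrative only): a plausible set of compile-time options for the NT/NT
+// kernel below. The macro names are the ones documented in this file's kernel headers;
+// the concrete values are assumptions chosen to show one consistent F16 configuration:
+//   -DMAT_MUL_NATIVE_MMUL_NT_NT -DDATA_TYPE=half -DHALF_PRECISION
+//   -DM0=4 -DN0=4 -DK0=1 -DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4
+//   -DM0_LEFTOVER=0 -DN0_LEFTOVER=0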
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
+
+ // The output/destination matrix is divided into "sections". Each section is filled by a group of
+ // threads of size MMUL_BLOCK_SIZE, bundled together according to GWS_x.
+ // Each thread writes to a tile of M0 x N0 (the usual output block size for a thread) in the output matrix.
+ // Therefore, the section dimensions are (MMUL_M0 x M0) x (MMUL_N0 x N0).
+
+ // The GWS is constructed in such a way that the y global id is the y section coordinate,
+ // and the x global id is a transformed thread id: MMUL_BLOCK_SIZE number of consecutive threads
+ // in the x dimension corresponding to a section.
+ // This can be visualized as first obtaining the coordinates of all the sections:
+ // x = [0, (N / N0) / MMUL_N0) --> (N / N0) / MMUL_N0 is the number of sections in x dimension
+ // y = [0, (M / M0) / MMUL_M0) --> (M / M0) / MMUL_M0 is the number of sections in y dimension
+ // Then multiply the x coordinates with MMUL_BLOCK_SIZE to get the consecutive thread ids in the x dimension
+ // x = [0, ((N / N0) / MMUL_N0) * MMUL_N0 * MMUL_M0)
+ // x = [0, (N / N0) * MMUL_M0)
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+ // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Within these sections, each thread writes onto a small output block of size M0 x N0
+ // in row major order. A section divided into tiles can be visualized as below.
+ //
+ // (Figure 1)
+ // A Section in the Output Matrix
+ //
+ // _____N0__________N0____________________N0____
+ // | | | | |
+ // | | | | |
+ // M0 | Thread 1 | Thread 2 | ... | Thread |
+ // | | | | MMUL_N0 |
+ // |___________|__________|_________|___________|
+ // | | | |
+ // | | | |
+ // M0 | Thread | . | |
+ // | MMUL_N0+1 | . | | (M0 x MMUL_M0)
+ // |___________| . | |
+ // | . | |
+ // | . | |
+ // | . | |
+ // | |___________|
+ // | | |
+ // | | Thread |
+ // M0 | | MMUL_N0 x |
+ // | | MMUL_M0 |
+ // |________________________________|___________|
+ // N0 x MMUL_N0
+ //
+ // The output matrix has several of these sections. As shown above, each section
+ // will be filled by a separate thread group of size MMUL_BLOCK_SIZE. The overall
+ // section layout of the output matrix is as below. For instance, S(1,1) will be filled
+ // by MMUL_BLOCK_SIZE (possibly equal to 16) threads, as are S(0,1) and the others.
+ //
+ // (Figure 2)
+ // DST Matrix
+ // ____________________________________
+ // | | | | |
+ // | S(0,0) | S(0,1) | ... | S(0, X) |
+ // |________|________|_______|_________|
+ // | | | | |
+ // | S(1,0) | S(1,1) | ... | S(1, X) |
+ // |________|________|_______|_________|
+ // | . | | |
+ // | . | | | Y = (M / M0) / MMUL_M0 - 1 (Max possible section y coordinate)
+ // | . | | | X = (N / N0) / MMUL_N0 - 1 (Max possible section x coordinate)
+ // |________|________|_________________|
+ // | | | | | S(y, x) denotes the section, and y and x are computed in
+ // | S(Y,0) | S(Y,1) | | S(Y, X) | section_y, section_x respectively.
+ // |________|________|_______|_________|
+ //
+ //
+ //
+ //
+ // A complete view involving the three matrices is given below. It exemplifies how the section S(0,0) is computed.
+ //
+ // (Figure 3)
+ // Complete View
+ //
+ // LHS Matrix RHS Matrix DST Matrix
+ //
+ // ___MMUL_K0___________ __MMUL_N0 x N0____________ ___MMUL_N0 x N0____________________
+ // /|xxxxxxxxxx| | /|xxxxxxxxxxxxxxx| | /|xxxxxxxxxxxxxxxxxxx| |
+ // / |xxxxxxxxxx| | MMUL_K0 ||xxxxxxxxxxxxxxx| | / |xxxxxxxxxxxxxxxxxxx| |
+ // MMUL_M0 | |xxxxxxxxxx| ---> | ||xxxxxxxxxxxxxxx| . . . | MMUL_M0 | |xxxxxxxxxxxxxxxxxxx| |
+ // x M0 | |xxxxxxxxxx| | \|_______________|_________| x M0 | |xxxxxxxxxxxxxxxxxxx| ... |
+ // | |xxxxxxxxxx| | | | | |xxxxxxxxxxxxxxxxxxx| |
+ // | |xxxxxxxxxx| | x | | | = \ |xxxxxxxxxxxxxxxxxxx| |
+ // \|__________|_________| | | | \|___________________| |
+ // | | | \/ | | |
+ // | , | |_________________________| | . |
+ // | , | | . |
+ // | , | | . |
+ // |____________________| |_________________________________|
+ //
+ // Horizontal and vertical arrows show the direction of the K loop (the main loop in the kernel).
+ // Each output section shown above is a zoomed-out version of Figure 1.
+ //
+ // In each iteration of the main loop, the LHS matrix moves rightward and the RHS matrix moves downward;
+ // the LHS section of (MMUL_M0 x M0) x MMUL_K0 and the RHS section of MMUL_K0 x (MMUL_N0 x N0) are multiplied
+ // "cooperatively" using arm_matrix_multiply calls, and the result is accumulated into the output (DST) section
+ // of size (MMUL_M0 x M0) x (MMUL_N0 x N0) shown with 'x' signs.
+ //
+ // If this were a single thread, the multiplication would be straightforward with a T_MMUL call.
+ // However, since it involves multiple threads working together using the aforementioned extension, it
+ // works slightly differently.
+ //
+ // Here is how threads access the LHS and RHS matrices:
+ // (Assume MMUL_K0 = MMUL_N0 = MMUL_M0 = 4 because the following diagram is heavily dependent on this)
+ //
+ // (Figure 4)
+ // Thread Access Layouts in LHS & RHS matrices
+ //
+ // LHS matrix RHS Matrix
+ // ___________________________ __________N0 times______N0 times____________________N0 times_______
+ // |__T0__|__T1__|__T2__|__T3__| |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T0__| ... | |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // M0 | . . | |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // Times | . . | |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|
+ // | . . | X
+ // |__T0__|__T1__|__T2__|__T3__|
+ // |__T4__|__T5__|__T6__|__T7__|
+ // |__T4__|__T5__|__T6__|__T7__|
+ // M0 | . . |
+ // Times | . . |
+ // | . . |
+ // |__T4__|__T5__|__T6__|__T7__|
+ // |__T8__|__T9__|__T10_|__T11_|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |__T12_|__T13_|__T14_|__T15_|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |__T12_|__T13_|__T14_|__T15_|
+ //
+ //
+ // This access layout is designed such that the threads access contiguous elements of each matrix (in terms of rows/columns).
+ // To multiply these large sections, one arm_matrix_multiply call is made for each of the M0 x N0 output elements. So, for each
+ // combination of m0 and n0 (iterating from 0 to M0-1 and N0-1 respectively), one arm_matrix_multiply call is
+ // made, and the MMUL_BLOCK_SIZE threads compute the result cooperatively.
+ //
+ // The matrix multiplication taking place in this extension
+ // is an "interleaved" one: for example, if m0=0 and n0=0, i.e. the first iteration, we would use rows
+ // 0, M0, 2*M0 and 3*M0 of the LHS matrix. Similarly, we jump in steps of N0 columns in the RHS matrix. This is how
+ // each thread accesses one element per (m0, n0) iteration.
+ //
+ // For example, if we have
+ // - an 8 x 4 LHS section
+ // - a 4 x 8 RHS section
+ // - each vector variable ai, bj representing a 4x1 vector
+ // - ^T (superscript T) denoting transpose
+ // - M0 = N0 = 2
+ // - MMUL_N0 = MMUL_M0 = MMUL_K0 = 4
+ //
+ // (Figure 5)
+ // Mathematical view of the Matrix Multiplication
+ //
+ // LHS RHS DST
+ // [ a1^T ] [ b1 b2 b3 b4 b5 b6 b7 b8 ] [ a1^Tb1 a1^Tb2 a1^Tb3 ... a1^Tb8 ]
+ // [ a2^T ] 4 x 8 [ a2^Tb1 a2^Tb2 a2^Tb3 ... a2^Tb8 ]
+ // [ a3^T ] [ ]
+ // [ a4^T ] = [ . . ]
+ // [ a5^T ] X [ . . ]
+ // [ a6^T ] [ . . ]
+ // [ a7^T ] [ ]
+ // [ a8^T ] [ a8^Tb1 a8^Tb2 a8^Tb3 ... a8^Tb8 ]
+ // 8 x 4 8 x 8
+ //
+ //
+ // For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:
+ //
+ // [ a1^T ] [ b1 b3 b5 b7 ] [ a1^Tb1 a1^Tb3 a1^Tb5 a1^Tb7 ]
+ // [ a3^T ] x 4 x 4 = [ a3^Tb1 a3^Tb3 a3^Tb5 a3^Tb7 ]
+ // [ a5^T ] [ a5^Tb1 a5^Tb3 a5^Tb5 a5^Tb7 ]
+ // [ a7^T ] [ a7^Tb1 a7^Tb3 a7^Tb5 a7^Tb7 ]
+ // 4 x 4 4 x 4
+ // The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+ // When we repeat this for every combination of (m0, n0), every element of the DST matrix "section" is filled.
+ //
+
+ // Get thread coordinates within an mmul block (of size MMUL_BLOCK_SIZE)
+ // Since threads are grouped in the x dimension, the x-dim global id modulo
+ // MMUL_BLOCK_SIZE is the thread id within the group, ranging from 0 to
+ // MMUL_BLOCK_SIZE - 1, because the thread numbering is in row-major order.
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
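+
+ // Worked example (illustrative, assuming MMUL_M0 = MMUL_N0 = 4, i.e. MMUL_BLOCK_SIZE = 16):
+ // x0 = 37 -> section_x = 37 / 16 = 2
+ // thread_id = 37 % 16 = 5
+ // thread_x = 5 % 4 = 1
+ // thread_y = 5 / 4 = 1
+ // i.e. this work-item is thread (x=1, y=1) of the mmul block that fills section column 2.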
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, this is a cooperative matrix multiplication, so we need a full block to get
+ // the correct results. Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ // The unclamped dst coordinates can be calculated easily from the output section coordinates and the thread coordinates (see above figure).
+
+ // See Figure 1 & 2. Thread step size is N0 and M0,
+ // Section step size is N0 x MMUL_N0 and M0 x MMUL_M0
+ // respectively for x, y dimensions.
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
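+
+ // Worked example (illustrative): with N = 8 and N0 = 4, a thread with
+ // dst_x_unclamped = 8 lies entirely outside the matrix; it is clamped to
+ // dst_x = N - N0 = 4 so that its loads in the cooperative loop stay in-bounds,
+ // and the dst_x_unclamped >= N check after the loop stops it from writing.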
+
+ // Starting LHS coordinates
+ const uint lhs_x = thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = dst_x;
+ const uint rhs_y = thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of M0 x K0, where K0 must be 1
+ TILE(DATA_TYPE, M0, 1, a);
+ // A tile of K0 x N0, where K0 must be 1
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
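+
+ // In exact arithmetic, and ignoring the cooperative interleaving, each thread's
+ // accumulator now holds its M0 x N0 output block; a scalar sketch of the
+ // equivalent computation (2D indexing used for illustration only):
+ // for(int m = 0; m < M0; ++m)
+ // for(int n = 0; n < N0; ++n)
+ // for(int k = 0; k < K; ++k)
+ // c_f32[m].s[n] += lhs[dst_y + m][k] * rhs[k][dst_x + n];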
+
+ // Threads "outside" the dst bounds do not write, but they still have to take part in the cooperative arm_matrix_multiply calls. That's why this early exit must come after the main loop.
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
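+ // For F32, 'c' simply aliases the F32 accumulator tile, so no copy is needed.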
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_NT)
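+
+// Host-side sketch (illustrative; the variable names are assumptions, not part of the
+// library's API): the global work size implied by the id ranges documented above would be
+// size_t gws[3] = { (N / N0) * MMUL_M0, (M / M0) / MMUL_M0, batch };
+// clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);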
+
+#if defined(MAT_MUL_NATIVE_MMUL_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=4). K must be a multiple of MMUL_K0
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in DST matrix
+ * @param[in] N Number of columns in DST matrix
+ * @param[in] K Number of rows in LHS and RHS matrices, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+ // For an explanation of how this kernel works, please refer to the NT/NT kernel. This kernel makes only minor modifications to it.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+ // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates
+ uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ uint thread_x = thread_id % MMUL_N0;
+ uint thread_y = (thread_id / MMUL_N0);
+
+ // See the NT/NT kernel for explanations
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ uint lhs_x = dst_y;
+ uint lhs_y = thread_x;
+
+ // Starting RHS coordinates
+ uint rhs_x = dst_x;
+ uint rhs_y = thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, 1, M0, a);
+ TILE(DATA_TYPE, 1, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[0].s[n0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
+
+ // Threads "outside" the dst bounds do not write, but they still have to take part in the cooperative arm_matrix_multiply calls. That's why this early exit must come after the main loop.
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_T_NT)
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of columns in LHS matrix and columns in RHS matrix, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+ // For an explanation of how this kernel works, please refer to the NT/NT kernel. This kernel makes only minor modifications to it.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+ // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within a block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+ // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of M0 x K0, where K0 must be 1
+ TILE(DATA_TYPE, M0, 1, a);
+ // A tile of N0 x K0, where K0 must be 1
+ TILE(DATA_TYPE, N0, 1, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[n0].s[0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);
+ }
+
+ // Threads "outside" the dst bounds do not write, but they still have to take part in the cooperative arm_matrix_multiply calls. That's why this early exit must come after the main loop.
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_T)
+
+#if defined(MAT_MUL_NATIVE_MMUL_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of rows in LHS matrix and columns in RHS matrix, which is a multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER),
+ const int M,
+ const int N,
+ const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+ // For an explanation of how this kernel works, please refer to the NT/NT kernel. This kernel makes only minor modifications to it.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+ // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within a block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Starting destination coordinates
+ // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+ // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+ // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = dst_y;
+ const uint lhs_y = thread_x;
+
+ // Starting RHS coordinates
+ const uint rhs_x = thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ // The MMUL extension accumulates the result in F32 for both F32 and F16
+ TILE(float, M0, N0, c_f32);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c_f32[i].v = 0;
+ })
+
+ for(int k = 0; k < K; k += MMUL_K0)
+ {
+ // A tile of K0 x M0, where K0 must be 1
+ TILE(DATA_TYPE, 1, M0, a);
+ // A tile of N0 x K0, where K0 must be 1
+ TILE(DATA_TYPE, N0, 1, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[n0].s[0], c_f32[m0].s[n0]);
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);
+ }
+
+ // Threads "outside" the dst bounds do not write, but they still have to take part in the cooperative arm_matrix_multiply calls. That's why this early exit must come after the main loop.
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if defined(HALF_PRECISION)
+ TILE(DATA_TYPE, M0, N0, c);
+
+ // Conversion required for half precision
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = c_f32[m0].s[n0];
+ })
+ })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
new file mode 100644
index 0000000000..7f81ac4549
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for the integer data type when bias is enabled.
+// Note: The tile dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+ TILE(int, 1, N0, bias_tile);
+
+ // The T_LOAD below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for the relu and bounded relu operations.
+ * @note The quantized value that represents 0 is the quantization offset (zero point) of the output data, which must be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor in a transposed fashion
+ // in order to use the T_MMUL_NT_T macro, because only this macro
+ // can use the dot product instruction for Int8/UInt8 by
+ // directly multiplying the rows of the Lhs and Rhs tensors.
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
+ }
+
+#if((K % K0) != 0)
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor in a transposed fashion.
+ // See the main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
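+
+ // The correction above, together with the accumulator's initial value of
+ // K * LHS_OFFSET * RHS_OFFSET, implements the usual zero-point expansion:
+ // sum_k (a_k - LHS_OFFSET) * (b_k - RHS_OFFSET)
+ // = sum_k a_k * b_k // raw T_MMUL accumulation
+ // - RHS_OFFSET * sum_k a_k // a_sum term above
+ // - LHS_OFFSET * sum_k b_k // b_sum term above
+ // + K * LHS_OFFSET * RHS_OFFSET // accumulator's initial value
+ // Numeric check (illustrative): K=2, LHS_OFFSET=2, RHS_OFFSET=3, a=(5,6), b=(7,8):
+ // direct: (5-2)*(7-3) + (6-2)*(8-3) = 12 + 20 = 32
+ // kernel: 83 - 3*11 - 2*15 + 2*2*3 = 32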
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
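+ // The int32 accumulator is rescaled to the 8-bit output domain with a
+ // gemmlowp-style fixed-point multiply, roughly
+ // accq = clamp(((acc * DST_MULTIPLIER) >> DST_SHIFT) + DST_OFFSET)
+ // (a sketch only; the exact rounding and shift handling are defined by
+ // T_QUANTIZE8_ASYMMETRIC in tile_helpers.h).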
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0, K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if((K % K0) != 0)
+ // Leftover loop
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ a_sum[0].s[i] += (int)a[i].s[j];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max output bounds for the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs/rhs tensors in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
+ }
+
+#if((K % K0) != 0)
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs/rhs tensors in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ b_sum[0].s[j] += (int)b[j].s[i];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function should be passed with -DACTIVATION_TYPE; -DA_VAL and -DB_VAL provide the min and max output bounds for the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+ const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+ const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, acc);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ int k;
+ for(k = 0; k <= K - K0; k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, K0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+ }
+
+#if((K % K0) != 0)
+ /* Leftover Loop */
+ for(; k < K; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0;
+ })
+
+ // Load tile from the lhs tensor in a transposed fashion
+ // see mat_mul_native_quantized_nt_nt main loop for more explanation
+ T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+ // Load tile from the rhs tensor
+ T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+ LOOP_UNROLLING(int, i, 0, 1, 1,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, M0,
+ {
+ a_sum[0].s[j] += (int)a[j].s[i];
+ })
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 1,
+ {
+ b_sum[0].s[i] += (int)b[i].s[j];
+ })
+ })
+
+ lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+ }
+#endif // ((K % K0) != 0)
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+
+ const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+ const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, accq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+ T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+ TILE(int, M0, 1, indirect_buffer);
+ LOOP_UNROLLING(int, _i, 0, 1, M0,
+ {
+ indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+ });
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
new file mode 100644
index 0000000000..fdfb75d39c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for integer data types when bias is enabled.
+// Note: The tile dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+ TILE(int, 1, N0, bias_tile);
+
+    // The T_LOAD below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
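+
+    // For example, with M0 = 2 and N0 = 4, the same 1x4 bias row is added to both rows of
+    // the 2x4 accumulator tile:
+    //   acc[0].v += bias_tile[0].v;
+    //   acc[1].v += bias_tile[0].v;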
+}
+#endif // defined(BIAS)
+
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
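+// For example, with the MMUL block sizes suggested in the kernel documentation below
+// (-DMMUL_M0=4, -DMMUL_N0=4), MMUL_BLOCK_SIZE is 16: sixteen threads cooperate on each
+// MMUL output block.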
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at
+ * compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER
+ * (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The dimensions M, N, K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=5, -DN=8, -DK=6).
+ *       K must be a multiple of 16.
+ * @note MMUL block sizes must be passed at compile time using -DMMUL_K0, -DMMUL_M0, -DMMUL_N0
+ * (e.g. -DMMUL_K0=16, -DMMUL_M0=4, -DMMUL_N0=4)
+ * @note If there is bias -DBIAS option must be passed at compile time
+ * @note Quantization offsets of lhs, rhs and dst tensors must be passed at compile time using -DLHS_OFFSET,
+ * -DRHS_OFFSET, -DDST_OFFSET (e.g. -DLHS_OFFSET=10, -DRHS_OFFSET=0, -DDST_OFFSET=-6)
+ * @note Effective quantization multiplier and shift for the destination tensor must be passed at compile time using
+ *       -DDST_MULTIPLIER and -DDST_SHIFT (e.g. -DDST_MULTIPLIER=2091, -DDST_SHIFT=8)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ * @note For a generic view on how the MMUL works, see mat_mul_mmul.cl
+ *
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_mmul_nt_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+    // The explanation of how this kernel works is very similar to the one given in
+    // mat_mul_mmul.cl. The MMUL logic and terminology are the same. The only difference is
+    // that for quantized multiplication the MMUL block sizes are (4 x 16) for the Lhs matrix
+    // and (16 x 4) for the Rhs matrix, resulting in a (4 x 4) MMUL block size for the destination.
+    //
+    // Figures 1, 2 and 3 in the previous explanation work the same. Since the Lhs and Rhs
+    // MMUL block sizes are different in the quantized extension, the thread access pattern is
+    // slightly different. We can redraw Figure 4 (Thread access pattern) as follows:
+ //
+ // (Modified Figure 4 from mat_mul_mmul.cl)
+ // Thread Access Layouts in LHS & RHS matrices
+ //
+ // LHS matrix
+ // 4 times 4 times 4 times 4 times
+ // _______________________________________________________________
+ // |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|
+ // |T0_| ... |
+ // M0 | . . |
+ // Times | . . |
+ // | . . |
+ // |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|
+ // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+ // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+ // M0 | . . |
+ // Times | . . |
+ // | . . |
+ // |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+ // |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|
+ // M0 | . |
+ // Times | . |
+ // | . |
+ // |T12|T12|T12|T12|T13|T13|T13|T13|T14|T14|T14|T14|T15|T15|T15|T15|
+ //
+ //
+ // RHS Matrix
+ //
+ // __________N0 times______N0 times____________________N0 times_______
+ // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // 4 times |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|
+ // |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // 4 times |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // X |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|
+ // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // 4 times |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|
+ // |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|
+ // 4 times |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|
+ // |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|
+ // |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|
+ //
+ //
+    // The logic behind this thread access pattern is already described in the explanation
+    // in mat_mul_mmul.cl. The only change is that thread accesses are extended from 1 to 4
+    // elements, rightward in the Lhs and downward in the Rhs, because each thread now
+    // operates on 4 char/uchar values (again 32-bit data) instead of one 32-bit float.
+    //
+    // The mathematical view of the matrix multiplication explained in Figure 5 also holds here,
+    // except that the inner dimension is 16 instead of 4; the vector notation does not change, i.e. it's as follows:
+ //
+ // Settings:
+    //  - an 8 x 16 LHS section
+    //  - a 16 x 8 RHS section
+    //  - Each vector variable ai, bj represents a 16x1 vector
+ // - ^T (superscript T) denotes transpose
+ // - M0 = N0 = 2
+ // - MMUL_N0 = MMUL_M0 = 4, MMUL_K0 = 16
+ //
+ //
+ // (Modified Figure 5)
+ // Mathematical view of the Matrix Multiplication
+ //
+ // LHS RHS DST
+    //    [ a1^T ]    [ b1 b2 b3 b4 b5 b6 b7 b8 ]    [ a1^Tb1  a1^Tb2  a1^Tb3 ... a1^Tb8 ]
+    //    [ a2^T ]      16 x 8                       [ a2^Tb1  a2^Tb2  a2^Tb3 ... a2^Tb8 ]
+    //    [ a3^T ]                                   [                                   ]
+    //    [ a4^T ]   =                               [   .                            .  ]
+    //    [ a5^T ]  X                                [   .                            .  ]
+    //    [ a6^T ]                                   [   .                            .  ]
+    //    [ a7^T ]                                   [                                   ]
+    //    [ a8^T ]                                   [ a8^Tb1  a8^Tb2  a8^Tb3 ... a8^Tb8 ]
+    //      8 x 16                                        8 x 8
+ //
+ //
+ // For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:
+ //
+    //         [ a1^T ]           [ b1 b3 b5 b7 ]          [ a1^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //         [ a3^T ]     x      4 x 4        =          [ a3^Tb1  a3^Tb3  a3^Tb5  a3^Tb7 ]
+    //         [ a5^T ]                                    [ a5^Tb1  a5^Tb3  a5^Tb5  a5^Tb7 ]
+    //         [ a7^T ]                                    [ a7^Tb1  a7^Tb3  a7^Tb5  a7^Tb7 ]
+    //           4 x 4                                       4 x 4
+    // The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+    // Repeating this for each combination of (m0, n0) fills every element of the DST matrix "section".
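+    //
+    // In short, with the example settings above (M0 = N0 = 2, MMUL 4x4 blocks), each of the
+    // 16 threads in an MMUL block computes an M0 x N0 (2x2) tile of interleaved DST elements,
+    // and together the block covers the whole 8x8 DST section.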
+ //
+ // Please refer to mat_mul_mmul.cl for more details.
+
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
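+
+    // For example, with MMUL_N0 = 4 and MMUL_BLOCK_SIZE = 16, thread_id 13 maps to
+    // (thread_x, thread_y) = (13 % 4, 13 / 4) = (1, 3) within its MMUL block.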
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
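+
+    // Clamping keeps edge threads on a valid in-range tile: they must still take part in the
+    // cooperative arm_matrix_multiply() calls below, and any thread whose unclamped
+    // coordinates fall outside the matrix returns before the store.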
+
+ // Starting LHS coordinates
+ const uint lhs_x = K0 * thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = dst_x;
+ const uint rhs_y = K0 * thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
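+
+    // vec_1 is an all-ones K0-vector: feeding it to arm_matrix_multiply() reuses the MMUL
+    // unit to accumulate plain sums, yielding the row sums of A (A x 1) and the column sums
+    // of B (1^T x B) needed for the offset correction after the loop.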
+
+ for(int k = 0; k < lhs_w; k += MMUL_K0)
+ {
+        // A tile of M0xK0 (K0 is fixed to 4 by the quantized MMUL extension)
+        TILE(DATA_TYPE, M0, K0, a);
+        // A tile of K0xN0 (K0 is fixed to 4 by the quantized MMUL extension)
+        TILE(DATA_TYPE, K0, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]);
+ })
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+#endif // LHS_OFFSET != 0
+ })
+
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+ })
+#endif // RHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
+
+    // Do not write if the coordinates are out of bounds
+    // However, the reads must still happen, as arm_matrix_multiply() expects a fixed number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * Supported block configurations:
+ * - M0 > 0
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_nt_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = K0 * thread_x;
+ const uint lhs_y = dst_y;
+
+ // Starting RHS coordinates
+ const uint rhs_x = K0 * thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+ for(int k = 0; k < lhs_w; k += MMUL_K0)
+ {
+        // A tile of M0xK0 (K0 is fixed to 4 by the quantized MMUL extension)
+        TILE(DATA_TYPE, M0, K0, a);
+        // A tile of N0xK0 (K0 is fixed to 4 by the quantized MMUL extension)
+        TILE(DATA_TYPE, N0, K0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = arm_matrix_multiply(a[m0].v, b[n0].v, c[m0].s[n0]);
+ })
+ })
+
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+ })
+#endif // RHS_OFFSET != 0
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]);
+ })
+#endif // LHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ }
+
+    // Do not write if the coordinates are out of bounds
+    // However, the reads must still happen, as arm_matrix_multiply() expects a fixed number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
+ *
+ * Supported block configurations:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_t_nt(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = dst_y;
+ const uint lhs_y = K0 * thread_x;
+
+ // Starting RHS coordinates
+ const uint rhs_x = dst_x;
+ const uint rhs_y = K0 * thread_y;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+ for(int k = 0; k < lhs_h; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, K0, N0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]);
+
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ c[m0].s[n0] = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+ })
+
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]);
+#endif // RHS_OFFSET != 0
+ })
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+ })
+#endif // LHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+ }
+
+    // Do not write if the coordinates are out of bounds
+    // However, the reads must still happen, as arm_matrix_multiply() expects a fixed number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
+ *
+ * Supported block configurations:
+ * - M0 = 1, 2, 3, 4, 8, 16
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_t_t(
+ TENSOR3D_T(lhs, BUFFER),
+ TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+ TENSOR3D_T(dst, BUFFER))
+{
+ const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE
+ const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+ const uint z = get_global_id(2); // Batch
+
+ // Get section coordinates
+ const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+ const uint section_y = y0;
+
+ // Get thread coordinates within an mmul block
+ const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+ const uint thread_x = thread_id % MMUL_N0;
+ const uint thread_y = (thread_id / MMUL_N0);
+
+ // Calculate dst coordinates
+ const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+ const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+ const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));
+ const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));
+
+ // Starting LHS coordinates
+ const uint lhs_x = dst_y;
+ const uint lhs_y = K0 * thread_x;
+
+ // Starting RHS coordinates
+ const uint rhs_x = K0 * thread_y;
+ const uint rhs_y = dst_x;
+
+ // Compute LHS/RHS/DST matrix address
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+ // Initialize the accumulators
+ TILE(int, M0, N0, c);
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+ })
+
+ // Calculate row and column sums
+ TILE(int, 1, N0, b_sum);
+ b_sum[0].v = 0;
+
+ TILE(int, 1, M0, a_sum);
+ a_sum[0].v = 0;
+
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+ for(int k = 0; k < lhs_h; k += MMUL_K0)
+ {
+ TILE(DATA_TYPE, K0, M0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ // Load tile from the lhs/rhs tensors
+ T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+ T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]);
+
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ c[m0].s[n0] = arm_matrix_multiply(vec_a, b[n0].v, c[m0].s[n0]);
+ })
+#if RHS_OFFSET != 0
+ // Row Sum of A: Calculate the sum of rows by multiplying A with
+ // a matrix of 1's from Right
+ a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]);
+#endif // RHS_OFFSET != 0
+ })
+
+#if LHS_OFFSET != 0
+ // Column Sum of B: Calculate the sum of columns by multiplying B
+ // with a matrix of 1's from Left
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]);
+ })
+#endif // LHS_OFFSET != 0
+
+ lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+ rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+ }
+
+    // Do not write if the coordinates are out of bounds
+    // However, the reads must still happen, as arm_matrix_multiply() expects a fixed number of calls from every thread
+ if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+ {
+ return;
+ }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+ LOOP_UNROLLING(int, j, 0, 1, N0,
+ {
+ c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+ })
+ })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+ // Quantize the tile
+ TILE(DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+ if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE(N0)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ if(dst_y + m0 < M || M0_LEFTOVER == 0)
+ {
+ VSTORE_PARTIAL(N0, N0_LEFTOVER)
+ (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+ }
+ })
+ }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)
diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
index 4141d3e8b7..22abf64874 100644
--- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl
+++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,7 +62,11 @@ __kernel void mean_stddev_normalization(
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
sum = 0.f;
+#ifdef MEANSTDNORM_HALF
+ VEC_DATA_TYPE(float, VEC_SIZE)
+#else /* MEANSTDNORM_HALF */
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif /* MEANSTDNORM_HALF */
sum_sq = 0.f;
// Calculate partial sum
int i = 0;
@@ -73,34 +77,34 @@ __kernel void mean_stddev_normalization(
data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
sum += data;
+#ifdef MEANSTDNORM_HALF
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ dsq = CONVERT(data * data, VEC_DATA_TYPE(float, VEC_SIZE));
+ sum_sq += dsq;
+#else /* MEANSTDNORM_HALF */
sum_sq += data * data;
+#endif /* MEANSTDNORM_HALF */
}
// Perform reduction
-#if VEC_SIZE > 8
- sum.s01234567 += sum.s89abcdef;
- sum_sq.s01234567 += sum_sq.s89abcdef;
-#endif // VEC_SIZE > 8
-#if VEC_SIZE > 4
- sum.s0123 += sum.s4567;
- sum_sq.s0123 += sum_sq.s4567;
-#endif // VEC_SIZE > 4
-#if VEC_SIZE > 2
- sum.s01 += sum.s23;
- sum_sq.s01 += sum_sq.s23;
-#endif // VEC_SIZE > 2
- sum.s0 += sum.s1;
- sum_sq.s0 += sum_sq.s1;
+ sum = SUM_REDUCE(sum, VEC_SIZE);
+ sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE);
+
+#if VEC_SIZE > 1
+#define sum sum.s0
+#define sum_sq sum_sq.s0
+#endif // VEC_SIZE > 1
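+    // After SUM_REDUCE, only component .s0 holds the reduced total; aliasing sum and sum_sq
+    // to their .s0 lanes lets the scalar leftover loop below keep using the same names.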
+
// Left-overs loop
for(; i < WIDTH; ++i)
{
DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
- sum.s0 += data;
- sum_sq.s0 += data * data;
+ sum += data;
+ sum_sq += data * data;
}
- DATA_TYPE mean = sum.s0 / WIDTH;
- DATA_TYPE var = (sum_sq.s0 / WIDTH) - (mean * mean);
+ DATA_TYPE mean = sum / WIDTH;
+ DATA_TYPE var = (sum_sq / WIDTH) - (mean * mean);
DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON);
i = 0;
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/common/memset.cl
index bb46a49f84..9ff25f3af4 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/common/memset.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl
index 655696f9a1..49356451df 100644
--- a/src/core/CL/cl_kernels/minmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/minmax_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/common/nonmax.cl
index ab13131807..702e635a89 100644
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ b/src/core/CL/cl_kernels/common/nonmax.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl
index fe71b5d119..5ae4ec884d 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/common/pad_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,14 +23,15 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE)
+#define SCALAR_COND(x) CONVERT((VEC_SELECT)x == (VEC_SELECT)1, VEC_SELECT)
-#if defined(CONST_VAL)
+#if defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
/** Perform a pad operation when PaddingMode is CONSTANT
*
* @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
@@ -39,7 +40,9 @@
* @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
* @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
* @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
- * -DNUM_THREADS_TO_SKIP_X compile flag, e.g. -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * -DTHREADS_TO_SKIP_BEFORE compile flag, e.g. -DTHREADS_TO_SKIP_BEFORE=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note In case pad left is more than the vector size, the index of the last thread along the X axis that still reads input values (later threads only write the right padding) must be passed using the
+ * -DTHREADS_TO_SKIP_AFTER compile flag, e.g. -DTHREADS_TO_SKIP_AFTER=1. This is defined as ((SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE)
* @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
* -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
* -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
@@ -76,67 +79,77 @@ __kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
#endif // defined(PAD_W_BEFORE)
)
{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- uint cond = 0;
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
-#if defined(PAD_W_BEFORE)
- cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
-#endif // defined(PAD_W_BEFORE)
+ // If true, write only padding values; no reads performed
+ uint cond = 0;
+#if defined(THREADS_TO_SKIP_BEFORE)
+ cond |= x < THREADS_TO_SKIP_BEFORE || x > THREADS_TO_SKIP_AFTER;
+#endif // defined(THREADS_TO_SKIP_BEFORE)
+#if defined(PAD_Y_BEFORE)
+ cond |= y < PAD_Y_BEFORE || y >= (SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
#if defined(PAD_Z_BEFORE)
cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
if(cond)
{
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- VSTORE(VEC_SIZE)
- ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
+ VEC_TYPE const_vals0 = (VEC_TYPE)CONST_VAL;
+ STORE_VECTOR_SELECT(const_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
}
else
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(NUM_THREADS_TO_SKIP_X)
- /* In case the pad left is greater than the vector size, and we are past the threads operating solely on pad values,
- * the input pointer must be brought back along the X axis to start from the first non-pad values.
- *
- * E.g. with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
- * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) < PAD_X_BEFORE
- * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, 3)) < PAD_X_BEFORE
- * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x * VEC_SIZE) == 4) values, reading |4 5|
- * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and check that it is >= to the current x
- * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying this constant by the input's step along the X axis
- * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it will read the desired values |0 1|
- */
- src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X);
-#endif // defined(NUM_THREADS_TO_SKIP_X)
+ // Calculate input's coordinates based on output's
+ int w = 0;
+#if defined(THREADS_TO_SKIP_BEFORE)
+ x -= THREADS_TO_SKIP_BEFORE;
+#endif // defined(THREADS_TO_SKIP_BEFORE)
+#if defined(PAD_Y_BEFORE)
+ y -= PAD_Y_BEFORE;
+#endif // defined(PAD_Y_BEFORE)
#if defined(PAD_Z_BEFORE)
- src.ptr -= PAD_Z_BEFORE * src_step_z;
+ z -= PAD_Z_BEFORE;
#endif // defined(PAD_Z_BEFORE)
#if defined(PAD_W_BEFORE)
- src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z;
+ w -= PAD_W_BEFORE * SRC_DEPTH;
#endif // defined(PAD_W_BEFORE)
+ x *= VEC_SIZE;
+ x -= PAD_X_BEFORE_REMAINDER;
- VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+ // Check for out of bound reads and clamp X coordinate
+ uint cond_left = x < 0;
+ uint cond_right = (x + VEC_SIZE) > SRC_WIDTH;
+ x = clamp(x, 0, (SRC_WIDTH - VEC_SIZE));
- VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT);
- VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
-#if defined(PAD_Y_BEFORE)
- cond |= (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE);
-#endif // defined(PAD_Y_BEFORE)
- VSTORE(VEC_SIZE)
- (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0, (__global DATA_TYPE *)dst.ptr);
+ // Calculate input's address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x + y * src_stride_y + z * src_stride_z + w * (int)src_stride_z;
+
+ // Read values and rotate them properly if they would have been across paddings
+ VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, PAD_X_BEFORE_REMAINDER), SCALAR_COND(cond_left));
+ src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, VEC_SIZE_LEFTOVER_READ), SCALAR_COND(cond_right));
+
+ // Check what values would be padding and replace them with the constant value
+ VEC_INT xs_out = (VEC_INT)(get_global_id(0) * VEC_SIZE) + VEC_OFFS(int, VEC_SIZE);
+ VEC_INT conds = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+ src_vals0 = select(src_vals0, (VEC_TYPE)CONST_VAL, CONVERT(conds, VEC_SELECT));
+
+ // Store values in bounds
+ STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
}
}
-#endif // defined(CONST_VAL)
+#endif // defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
-#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+#if defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
-#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1
#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
@@ -232,20 +245,19 @@ __kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src),
((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
#else // SRC_WIDTH == 1
- VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
// Choose rearrangement policy based on the defined conditions
- src_vals = select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
- src_vals = select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
- src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+ src_vals0 = select(src_vals0, SYMM_REFL_LEFT(src_vals0, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
+ src_vals0 = select(src_vals0, SYMM_REFL_RIGHT(src_vals0, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
+ src_vals0 = select(src_vals0, REVERSE(src_vals0, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
#if defined(AFTER_PAD_REM)
- src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+ src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
#endif // defined(AFTER_PAD_REM)
- // Store
- VSTORE(VEC_SIZE)
- (src_vals, 0, (__global DATA_TYPE *)dst.ptr);
+ // Store values in bounds
+ STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
#endif // SRC_WIDTH == 1
}
-#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
+#endif // defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
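
The rewritten constant-pad path above no longer rewinds the source pointer per thread; it clamps the read position into [0, SRC_WIDTH - VEC_SIZE], performs one full vector load, and ROTATEs the lanes so the valid elements land where the output expects them, before select() swaps the padding lanes for CONST_VAL. A minimal 4-wide sketch of the rotation step (rotate4 is a hypothetical stand-in for the repo's ROTATE helper, and the rotation direction shown is illustrative):

    // Rotate the four lanes by n positions, so that after the read position
    // has been clamped into bounds, the valid elements are realigned to the
    // lanes the output expects, instead of doing four guarded scalar loads.
    float4 rotate4(float4 v, int n)
    {
        float4 r = v;
        for(int i = 0; i < n; ++i)
        {
            r = (float4)(r.s3, r.s012); // move the last lane to the front
        }
        return r;
    }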
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/common/permute.cl
index db9e7ecc25..1a97ca7495 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/common/permute.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,7 +54,7 @@ __kernel void permute(TENSOR4D_DECLARATION(input),
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
int out_index[4] = { 0 };
int in_index[4] = { 0 };
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
index 4fa1551b54..10875293a9 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,10 @@
#include "activation_float_helpers.h"
#endif // defined(ACTIVATION_TYPE)
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+
/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -73,41 +77,53 @@
__kernel void pixelwise_mul_float(
TENSOR3D_DECLARATION(in1),
TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
const float scale)
{
// Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
// Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+ VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE);
+ VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE);
// Perform multiplication
#ifdef DATA_TYPE_FLOAT
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE);
#else /* DATA_TYPE_FLOAT */
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(ACC_DATA_TYPE, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+ VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND);
#endif /* DATA_TYPE_FLOAT */
#if defined(ACTIVATION_TYPE)
- vstore16(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr);
-#else // defined(ACTIVATION_TYPE)
- // Store result
- vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
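
This file also adopts the unaligned-start pattern used throughout this patch: x = max(gid0 * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT, 0) rewinds the first work-item so every later one is vector-aligned, and STORE_VECTOR_SELECT makes that first work-item write only the leftover lanes. For a row of 10 floats with VEC_SIZE_OUT = 4, VEC_SIZE_LEFTOVER = 10 % 4 = 2: work-item 0 starts at x = max(0 - 2, 0) = 0 and stores 2 elements, work-items 1 and 2 start at x = 2 and x = 6 and store full vectors, covering all 10 elements with no out-of-bounds access. A minimal sketch of the store side (a hypothetical helper; the real macro lives in the repo's helpers):

    // Scalar model of STORE_VECTOR_SELECT for VEC_SIZE = 4: the designated
    // work-item writes only 'leftover' lanes; everyone else stores a full vector.
    void store_vector_select_sketch(__global float *dst, float4 v, int leftover, int is_partial)
    {
        if(is_partial)
        {
            float tmp[4] = { v.s0, v.s1, v.s2, v.s3 };
            for(int i = 0; i < leftover; ++i)
            {
                dst[i] = tmp[i]; // partial store of the first lanes only
            }
        }
        else
        {
            vstore4(v, 0, dst); // full vector store
        }
    }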
+#if defined(DATA_TYPE)
+
/** Performs a pixelwise multiplication of complex float values
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
@@ -143,16 +159,21 @@ __kernel void pixelwise_mul_complex(
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
- float2 vin1 = vload2(0, (__global float *)in1.ptr);
- float2 vin2 = vload2(0, (__global float *)in2.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ vin1 = vload2(0, (__global DATA_TYPE *)in1.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ vin2 = vload2(0, (__global DATA_TYPE *)in2.ptr);
// Perform complex multiplication
- float2 res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y };
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ res = { vin1.x * vin2.x - vin1.y * vin2.y, vin1.x * vin2.y + vin2.x * vin1.y };
#if defined(ACTIVATION_TYPE)
- vstore2(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global float *)out.ptr);
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr);
#else // defined(ACTIVATION_TYPE)
// Store result
- vstore2(res, 0, (__global float *)out.ptr);
+ vstore2(res, 0, (__global DATA_TYPE *)out.ptr);
#endif // defined(ACTIVATION_TYPE)
}
+
+#endif // defined(DATA_TYPE)
\ No newline at end of file
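
pixelwise_mul_complex packs each pixel as a (real, imaginary) pair in a 2-component vector, and the res expression is the standard complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i. A minimal sketch of just that step:

    // Complex product of two (real, imag) pairs.
    float2 complex_mul_sketch(float2 v1, float2 v2)
    {
        return (float2)(v1.x * v2.x - v1.y * v2.y,  // real part: ac - bd
                        v1.x * v2.y + v2.x * v1.y); // imaginary part: ad + bc
    }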
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
index b0bd338147..6d1c2d0c79 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,10 @@
#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
/** Performs a pixelwise multiplication with integer scale of integer inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -72,30 +76,42 @@
__kernel void pixelwise_mul_int(
TENSOR3D_DECLARATION(in1),
TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
const uint scale)
{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+ // Load data
+ VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE);
+ VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE);
// Perform multiplication and store result
- vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT);
+ STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
-#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
+#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
/** Performs a pixelwise multiplication with float scale of quantized inputs.
*
@@ -138,17 +154,31 @@ __kernel void pixelwise_mul_int(
__kernel void pixelwise_mul_quantized(
TENSOR3D_DECLARATION(in1),
TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
const float scale)
{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+ size_t y = get_global_id(1);
+ size_t z = get_global_id(2);
+
+ __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+ __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+ __global uchar *
+#if !defined(IN_PLACE)
+ out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+ out_addr = in1_addr;
+#else //defined(SRC1_IN_PLACE)
+ out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
// Load data
- VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT);
- VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT);
+ VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
+ VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
// Dequantize
#if defined(OFFSET_IN1)
@@ -165,10 +195,9 @@ __kernel void pixelwise_mul_quantized(
#else // defined(OFFSET_OUT)
const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT);
#endif // defined(OFFSET_OUT)
- const VEC_TYPE res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+ const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
// Store result
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
+#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */
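
pixelwise_mul_quantized keeps the same flow at any vector width: dequantize each input with its own scale and offset, multiply in float together with the runtime scale argument, requantize against the output's scale and offset, then round to nearest even and saturate into the output type. A minimal scalar QASYMM8 sketch under those assumptions (parameter names mirror the -DSCALE_* and -DOFFSET_* build options):

    // Dequantize -> multiply -> requantize -> round -> saturate.
    uchar qmul_sketch(uchar a, uchar b, float scale,
                      float scale_in1, int offset_in1,
                      float scale_in2, int offset_in2,
                      float scale_out, int offset_out)
    {
        float in1f = ((int)a - offset_in1) * scale_in1;
        float in2f = ((int)b - offset_in2) * scale_in2;
        float resf = (in1f * in2f * scale) / scale_out + (float)offset_out;
        return convert_uchar_sat(convert_int_rte(resf));
    }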
diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
index 24cb111772..4494dd8cec 100644
--- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
+++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl
index 3538dae5f0..69cc288c25 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/common/quantization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,8 +80,8 @@ __kernel void quantization_layer(
// Create scale and offset vectors
const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
- const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
-#else // defined(IS_FLOAT)
+ const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
+#else // defined(IS_FLOAT)
// Load data
VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/common/range.cl
index 1e5c77b376..d25d10e207 100644
--- a/src/core/CL/cl_kernels/range.cl
+++ b/src/core/CL/cl_kernels/common/range.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,13 +23,29 @@
*/
#include "helpers.h"
-#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
+#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
+
+#if !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#if VECTOR_SIZE == 2
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 2))(0, STEP))
+#elif VECTOR_SIZE == 3
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 3))(0, STEP, 2 * STEP))
+#elif VECTOR_SIZE == 4
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 4))(0, STEP, 2 * STEP, 3 * STEP))
+#elif VECTOR_SIZE == 8
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
+#elif VECTOR_SIZE == 16
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
+#endif // VECTOR_SIZE == 2
+
/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
*
* @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
 * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DVECTOR_SIZE=4
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER, e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
*
* @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -39,45 +55,34 @@
__kernel void range(
VECTOR_DECLARATION(out))
{
- uint id = get_global_id(0) * VECTOR_SIZE;
- __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+ uint id = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+ __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
#if VECTOR_SIZE == 1
DATA_TYPE seq;
seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
- *((__global DATA_TYPE *)dst_ptr) = seq;
-#else // VECTOR_SIZE == 1
+ *(__global DATA_TYPE *)dst_ptr = seq;
+#else // VECTOR_SIZE == 1
VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- seq;
-
- seq.s0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
-#if VECTOR_SIZE > 1
- seq.s1 = seq.s0 + (DATA_TYPE)STEP;
-#if VECTOR_SIZE > 2
- seq.s2 = seq.s1 + (DATA_TYPE)STEP;
-#if VECTOR_SIZE > 3
- seq.s3 = seq.s2 + (DATA_TYPE)STEP;
-#if VECTOR_SIZE > 4
- seq.s4 = seq.s3 + (DATA_TYPE)STEP;
-#if VECTOR_SIZE > 5
- seq.s5 = seq.s4 + (DATA_TYPE)STEP;
-#if VECTOR_SIZE > 6
- seq.s6 = seq.s5 + (DATA_TYPE)STEP;
-#if VECTOR_SIZE > 7
- seq.s7 = seq.s6 + (DATA_TYPE)STEP;
-#endif // VECTOR_SIZE > 7
-#endif // VECTOR_SIZE > 6
-#endif // VECTOR_SIZE > 5
-#endif // VECTOR_SIZE > 4
-#endif // VECTOR_SIZE > 3
-#endif // VECTOR_SIZE > 2
-#endif // VECTOR_SIZE > 1
- VSTORE(VECTOR_SIZE)
- (seq, 0, ((__global DATA_TYPE *)dst_ptr));
+ seq0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
+ seq0 = seq0 + STEP_VEC;
+ STORE_VECTOR_SELECT(seq, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#endif //VECTOR_SIZE == 1
}
-#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+#else // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#if VECTOR_SIZE == 2
+#define STEP_VEC ((VEC_DATA_TYPE(float, 2))(0, STEP))
+#elif VECTOR_SIZE == 3
+#define STEP_VEC ((VEC_DATA_TYPE(float, 3))(0, STEP, 2 * STEP))
+#elif VECTOR_SIZE == 4
+#define STEP_VEC ((VEC_DATA_TYPE(float, 4))(0, STEP, 2 * STEP, 3 * STEP))
+#elif VECTOR_SIZE == 8
+#define STEP_VEC ((VEC_DATA_TYPE(float, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
+#elif VECTOR_SIZE == 16
+#define STEP_VEC ((VEC_DATA_TYPE(float, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
+#endif // VECTOR_SIZE == 2
#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
@@ -99,47 +104,25 @@ __kernel void range(
__kernel void range_quantized(
VECTOR_DECLARATION(out))
{
- size_t id = get_global_id(0) * VECTOR_SIZE;
- __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+ uint id = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+ __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
#if VECTOR_SIZE == 1
- float seq;
- seq = (float)START + (float)id * (float)STEP;
- seq = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
- seq = max(0.0f, min(seq, 255.0f));
- *((__global uchar *)dst_ptr) = CONVERT_SAT(CONVERT_DOWN(seq, int), uchar);
-#else // VECTOR_SIZE == 1
+ float seq;
+ seq = (float)START + (float)id * (float)STEP;
+ seq = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
+ seq = max(0.0f, min(seq, 255.0f));
+ *(__global DATA_TYPE *)dst_ptr = CONVERT_SAT(CONVERT_DOWN(seq, int), DATA_TYPE);
+#else // VECTOR_SIZE == 1
VEC_DATA_TYPE(float, VECTOR_SIZE)
- seq;
- seq.s0 = (float)START + id * (float)STEP;
-#if VECTOR_SIZE > 1
- seq.s1 = seq.s0 + (float)STEP;
-#if VECTOR_SIZE > 2
- seq.s2 = seq.s1 + (float)STEP;
-#if VECTOR_SIZE > 3
- seq.s3 = seq.s2 + (float)STEP;
-#if VECTOR_SIZE > 4
- seq.s4 = seq.s3 + (float)STEP;
-#if VECTOR_SIZE > 5
- seq.s5 = seq.s4 + (float)STEP;
-#if VECTOR_SIZE > 6
- seq.s6 = seq.s5 + (float)STEP;
-#if VECTOR_SIZE > 7
- seq.s7 = seq.s6 + (float)STEP;
-#endif // VECTOR_SIZE > 7
-#endif // VECTOR_SIZE > 6
-#endif // VECTOR_SIZE > 5
-#endif // VECTOR_SIZE > 4
-#endif // VECTOR_SIZE > 3
-#endif // VECTOR_SIZE > 2
-#endif // VECTOR_SIZE > 1
- seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
- seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
- VEC_DATA_TYPE(uchar, VECTOR_SIZE)
- res = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(uchar, VECTOR_SIZE));
- VSTORE(VECTOR_SIZE)
- (res, 0, ((__global DATA_TYPE *)dst_ptr));
+ seq = (float)START + id * (float)STEP;
+ seq = seq + STEP_VEC;
+ seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
+ seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ res0 = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
#endif // VECTOR_SIZE == 1
}
-#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
+#endif // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
-#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
+#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
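
The STEP_VEC macros above replace the old lane-by-lane seq.s1 = seq.s0 + STEP chain: each work-item broadcasts its base value START + id * STEP and adds the precomputed lane offsets in one vector operation. A minimal 4-wide sketch:

    // With START = 5, STEP = 3 and VECTOR_SIZE = 4, the work-item covering
    // id = 8 broadcasts the base 5 + 8 * 3 = 29 and adds STEP_VEC = (0, 3, 6, 9),
    // producing (29, 32, 35, 38) in a single add.
    int4 range_chunk_sketch(int start, int step, int id)
    {
        int4 step_vec = (int4)(0, step, 2 * step, 3 * step); // STEP_VEC for size 4
        return (int4)(start + id * step) + step_vec;
    }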
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index b2e56928d0..99369be19a 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,74 +25,31 @@
#include "helpers_asymm.h"
#if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) isgreater(x, y)
-#define ISLESS(x, y) isless(x, y)
+#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
+#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
+#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
+#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
#else // !FLOAT_DATA_TYPE
#if defined(WIDTH)
#define ISGREATER(x, y) (x > y) ? 1 : 0
#define ISLESS(x, y) (x < y) ? 1 : 0
+#define ISGREATER_SCALAR ISGREATER
+#define ISLESS_SCALAR ISLESS
#else // !defined(WIDTH)
-#define ISGREATER(x, y) select((int16)0, (int16)-1, x > y)
-#define ISLESS(x, y) select((int16)0, (int16)-1, x < y)
+#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
+#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
#endif // defined(WIDTH)
#endif // defined(FLOAT_DATA_TYPE)
-/** Calculate square sum of a vector
- *
- * @param[in] input Pointer to the first pixel.
- *
- * @return square sum of vector.
- */
-inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input);
-
- in *= in;
-
- in.s01234567 += in.s89ABCDEF;
- in.s0123 += in.s4567;
- in.s01 += in.s23;
-
- return (in.s0 + in.s1);
-}
-
-/** Calculate sum of a vector
- *
- * @param[in] input Pointer to the first pixel.
- *
- * @return sum of vector.
- */
-inline DATA_TYPE sum(__global const DATA_TYPE *input)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input);
-
- in.s01234567 += in.s89ABCDEF;
- in.s0123 += in.s4567;
- in.s01 += in.s23;
-
- return (in.s0 + in.s1);
-}
-
-/** Calculate product of a vector
- *
- * @param[in] input Pointer to the first pixel.
- *
- * @return product of vector.
- */
-inline DATA_TYPE product(__global const DATA_TYPE *input)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, input);
+#if defined(WIDTH)
+#if defined(OPERATION)
- in.s01234567 *= in.s89ABCDEF;
- in.s0123 *= in.s4567;
- in.s01 *= in.s23;
+#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
+#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
+#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))
+#define min_(in0, in1, size) (min(in0, MIN_REDUCE(in1, size)))
+#define max_(in0, in1, size) (max(in0, MAX_REDUCE(in1, size)))
- return (in.s0 * in.s1);
-}
-#if defined(OPERATION)
/** This kernel performs parallel reduction given an operation on x-axis.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
@@ -101,89 +58,84 @@ inline DATA_TYPE product(__global const DATA_TYPE *input)
* @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: same as @p src_ptr
- * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_results Local buffer for storing the partial result
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr Pointer to the destination tensor. Supported data types: same as @p input
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void reduction_operation_x(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(partial_res),
- __local DATA_TYPE *local_results)
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
- unsigned int lsize = get_local_size(0);
- unsigned int lid = get_local_id(0);
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
- for(unsigned int y = 0; y < get_local_size(1); ++y)
- {
- local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Perform parallel reduction
- for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
- {
- if(lid < i)
- {
+#if !defined(MIN) && !defined(MAX)
#if defined(PROD)
- local_results[lid] *= local_results[lid + i];
-#else // !defined(PROD)
- local_results[lid] += local_results[lid + i];
+ DATA_TYPE res = (DATA_TYPE)1;
+#else // defined(PROD)
+ DATA_TYPE res = (DATA_TYPE)0;
#endif // defined(PROD)
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- if(lid == 0)
- {
-#if defined(MEAN) && defined(WIDTH)
- if(y == get_local_size(1) - 1)
- {
- local_results[0] /= WIDTH;
- }
-#endif // defined(MEAN) && defined(WIDTH)
- ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
- }
+#else // !defined(MIN) && !defined(MAX)
+ DATA_TYPE res = *((__global DATA_TYPE *)input_addr);
+#endif // !defined(MIN) && !defined(MAX)
+ int x = 0;
+
+ for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+ res = OPERATION(res, vals, VEC_SIZE);
}
+
+#if(WIDTH % VEC_SIZE)
+ _Pragma("unroll") for(; x < WIDTH; ++x)
+ {
+ DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+ res = OPERATION(res, val, 1);
+ }
+#endif // (WIDTH % VEC_SIZE)
+
+#if defined(MEAN)
+ res /= WIDTH;
+#endif // defined(MEAN)
+ *((__global DATA_TYPE *)output_addr) = res;
}
#endif // defined(OPERATION)
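
The rewritten x-axis reduction hands each work-item a whole row, so the local-memory tree and its barriers disappear: the OPERATION macros fold a loaded vector into a scalar accumulator through the *_REDUCE helpers, and a scalar tail loop covers the WIDTH % VEC_SIZE leftovers. A minimal sum-only sketch of that shape, assuming a 4-wide vector and the kernel's -DWIDTH define:

    __kernel void row_sum_sketch(__global const float *row, __global float *out)
    {
        float res = 0.0f;
        int   x   = 0;
        for(; x <= WIDTH - 4; x += 4)
        {
            float4 v = vload4(0, row + x);
            v.s01 += v.s23;     // pairwise lane reduction, as in SUM_REDUCE
            res += v.s0 + v.s1; // collapse the remaining two lanes
        }
        for(; x < WIDTH; ++x) // leftover tail, one element at a time
        {
            res += row[x];
        }
        *out = res;
    }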
-
-#if defined(WIDTH)
/** This kernel performs reduction on x-axis. (Non parallel)
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
* @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
- * @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptr
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void reduction_operation_non_parallel_x(
- VECTOR_DECLARATION(src),
+ VECTOR_DECLARATION(input),
VECTOR_DECLARATION(output))
{
- Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ Vector input = CONVERT_TO_VECTOR_STRUCT(input);
Vector output = CONVERT_TO_VECTOR_STRUCT(output);
- DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, 0)), DATA_TYPE_PROMOTED);
+ DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);
// Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
@@ -192,11 +144,11 @@ __kernel void reduction_operation_non_parallel_x(
for(unsigned int x = 1; x < WIDTH; ++x)
{
- DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, x)), DATA_TYPE_PROMOTED);
+ DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
#if defined(MIN)
- res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
+ res = select(res, in, ISLESS_SCALAR(in, res));
#elif defined(MAX)
- res = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
+ res = select(res, in, ISGREATER_SCALAR(in, res));
#elif defined(PROD)
#if defined(OFFSET) && defined(SCALE)
res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
@@ -233,32 +185,37 @@ __kernel void reduction_operation_non_parallel_x(
* @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void reduction_operation_y(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(output))
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_z,
+ uint output_offset_first_element_in_bytes)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image output = CONVERT_TO_IMAGE_STRUCT(output);
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int z = get_global_id(1);
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
// Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
#if defined(SUM_SQUARE)
@@ -267,8 +224,8 @@ __kernel void reduction_operation_y(
for(unsigned int y = 1; y < HEIGHT; ++y)
{
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
#if defined(MIN)
res = select(res, in, ISLESS(in, res));
#elif defined(MAX)
@@ -280,7 +237,7 @@ __kernel void reduction_operation_y(
#if defined(PROD)
#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#else // !(defined(OFFSET) && defined(SCALE))
res *= in;
#endif // defined(OFFSET) && defined(SCALE)
@@ -302,11 +259,13 @@ __kernel void reduction_operation_y(
// Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
// Store result
- vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif // defined(HEIGHT)
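
reduction_operation_y keeps a vector accumulator of VEC_SIZE adjacent columns and walks down HEIGHT rows; for MIN/MAX, select() with an isgreater()/isless() mask picks the winning element per lane, which is what the widened ISGREATER/ISLESS macros expand to. A minimal max-only sketch, assuming a 4-wide vector, the kernel's -DHEIGHT define, and a hypothetical row stride given in elements:

    __kernel void col_max_sketch(__global const float *src, int stride, __global float *dst)
    {
        const int x   = get_global_id(0) * 4;
        float4    res = vload4(0, src + x); // row 0 seeds the accumulator
        for(int y = 1; y < HEIGHT; ++y)
        {
            float4 in = vload4(0, src + x + y * stride);
            res = select(res, in, isgreater(in, res)); // per-lane max
        }
        vstore4(res, 0, dst + x);
    }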
@@ -317,54 +276,51 @@ __kernel void reduction_operation_y(
* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void reduction_operation_z(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_stride_w,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_y,
+ uint output_stride_w,
+ uint output_offset_first_element_in_bytes)
{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+ int w = get_global_id(2);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w;
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
// Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-#if defined(COMPLEX)
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
-#endif // defined(COMPLEX)
#if defined(SUM_SQUARE)
res *= res;
#endif // defined(SUM_SQUARE)
for(unsigned int z = 1; z < DEPTH; ++z)
{
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
-
-#if defined(COMPLEX)
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- in1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
-#endif // defined(COMPLEX)
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
#if defined(MIN)
res = select(res, in, ISLESS(in, res));
@@ -377,16 +333,13 @@ __kernel void reduction_operation_z(
#if defined(PROD)
#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#else // !(defined(OFFSET) && defined(SCALE))
res *= in;
#endif // defined(OFFSET) && defined(SCALE)
-#else // !defined(PROD)
+#else // !defined(PROD)
res += in;
-#if defined(COMPLEX)
- res1 += in1;
-#endif // defined(COMPLEX)
#endif // defined(PROD)
#endif // defined(MAX) || defined(MIN)
}
@@ -402,14 +355,14 @@ __kernel void reduction_operation_z(
// Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
// Store result
- vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
-#if defined(COMPLEX)
- vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
-#endif // defined(COMPLEX)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(DEPTH) */
@@ -418,42 +371,51 @@ __kernel void reduction_operation_z(
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the output tensor in V dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void reduction_operation_w(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_stride_w,
+ uint input_stride_v,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_y,
+ uint output_stride_z,
+ uint output_stride_v,
+ uint output_offset_first_element_in_bytes)
{
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int y = get_global_id(1);
+
+    int gid_2 = get_global_id(2);
+    int z     = gid_2 % DEPTH;
+    int v     = gid_2 / DEPTH;
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v;
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
// Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
#if defined(SUM_SQUARE)
@@ -462,8 +424,8 @@ __kernel void reduction_operation_w(
for(unsigned int w = 1; w < BATCH; ++w)
{
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+ in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
#if defined(MIN)
res = select(res, in, ISLESS(in, res));
@@ -476,7 +438,7 @@ __kernel void reduction_operation_w(
#if defined(PROD)
#if defined(OFFSET) && defined(SCALE)
- res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#else // !(defined(OFFSET) && defined(SCALE))
res *= in;
#endif // defined(OFFSET) && defined(SCALE)
@@ -498,10 +460,12 @@ __kernel void reduction_operation_w(
// Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
- res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+ res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
// Store result
- vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(BATCH) && defined(DEPTH) */
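The leftover-aware indexing used by reduction_operation_w above is easier to follow in scalar form: the first work-item would otherwise read out of bounds, so its start is clamped to zero and STORE_VECTOR_SELECT writes only the leftover lanes for it. A minimal host-side C sketch, assuming a hypothetical width of 10 and a vector size of 4:

#include <stdio.h>

/* Mirrors: x = max(gid * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0) */
static int start_offset(int gid, int vec_size, int leftover)
{
    int x = gid * vec_size - (vec_size - leftover) % vec_size;
    return x < 0 ? 0 : x; /* only the first work-item clamps to 0 */
}

int main(void)
{
    const int width = 10, vec_size = 4;
    const int leftover = width % vec_size;               /* VEC_SIZE_LEFTOVER = 2 */
    const int items = (width + vec_size - 1) / vec_size; /* one item per vector   */

    for (int gid = 0; gid < items; ++gid)
    {
        int x = start_offset(gid, vec_size, leftover);
        /* The STORE_VECTOR_SELECT condition makes gid 0 store only the
           leftover lanes; every other work-item stores a full vector. */
        printf("gid %d: loads [%d,%d), stores %d elements\n",
               gid, x, x + vec_size, gid == 0 ? leftover : vec_size);
    }
    return 0;
}

Neighbouring work-items overlap by the leftover amount, which is why the first item only commits the leading lanes.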
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl
index 2d6a7edade..c47664bf85 100644
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ b/src/core/CL/cl_kernels/common/reshape_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,20 +51,20 @@ __kernel void reshape_layer(TENSOR3D_DECLARATION(input),
int2 input_shape,
int2 output_shape)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ int out_x = get_global_id(0);
+ int out_y = get_global_id(1);
+ int out_z = get_global_id(2);
- int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+ // Compute the output linearized index
+ int out_linear_idx = out_x + out_y * output_shape.x + out_z * output_shape.x * output_shape.y;
- // Linearize index
- int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
-
- // Translate to output
- int3 out_id;
- out_id.x = linear_idx % output_shape.x;
- out_id.y = (linear_idx / output_shape.x) % output_shape.y;
- out_id.z = linear_idx / (output_shape.x * output_shape.y);
+    // Translate to input
+ int in_x = out_linear_idx % input_shape.x;
+ int in_y = (out_linear_idx / input_shape.x) % input_shape.y;
+ int in_z = out_linear_idx / (input_shape.x * input_shape.y);
// Store result
- *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
+ input_ptr += input_offset_first_element_in_bytes + in_x * input_stride_x + in_y * input_stride_y + in_z * input_stride_z;
+ output_ptr += output_offset_first_element_in_bytes + out_x * output_stride_x + out_y * output_stride_y + out_z * output_stride_z;
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)input_ptr);
}
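The rewritten reshape kernel walks the output domain: each output coordinate is linearized against the output shape and then decomposed against the input shape. A standalone C sketch of the same translation for a hypothetical 6x2 to 4x3 reshape (z dimension omitted for brevity):

#include <stdio.h>

int main(void)
{
    const int in_w = 6, in_h = 2;   /* input_shape  (x, y), 12 elements */
    const int out_w = 4, out_h = 3; /* output_shape (x, y), 12 elements */

    for (int out_y = 0; out_y < out_h; ++out_y)
        for (int out_x = 0; out_x < out_w; ++out_x)
        {
            /* out_linear_idx */
            int linear = out_x + out_y * out_w;

            /* "Translate to input": decompose against the input shape */
            int in_x = linear % in_w;
            int in_y = (linear / in_w) % in_h;

            printf("out(%d,%d) <- in(%d,%d)\n", out_x, out_y, in_x, in_y);
        }
    return 0;
}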
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl
index 10ffe84aeb..e6df3041c2 100644
--- a/src/core/CL/cl_kernels/reverse.cl
+++ b/src/core/CL/cl_kernels/common/reverse.cl
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,6 +33,8 @@
*
* @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
* @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ * @note The number of dimensions of the source tensor must be given as a preprocessor argument using -DRANK=num, e.g. -DRANK=3
+ * @note The values in axis_tensor must be within [-rank, rank-1].
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: All
* @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
@@ -69,7 +71,7 @@ __kernel void reverse(TENSOR4D_DECLARATION(src),
{
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
Vector axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
- Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst);
const uint x_in = get_global_id(0);
const uint y_in = get_global_id(1);
@@ -78,20 +80,24 @@ __kernel void reverse(TENSOR4D_DECLARATION(src),
const uint4 dims = (uint4)(0, 1, 2, 3);
int4 to_reverse = (int4)(0, 0, 0, 0);
+
+    VEC_DATA_TYPE(int, NUM_REVERSE_DIMS) indices = VLOAD(NUM_REVERSE_DIMS)(0, (__global int *)axis.ptr);
+#if defined(USE_INVERTED_AXIS)
+ indices = select((VEC_DATA_TYPE(int, NUM_REVERSE_DIMS)) RANK - 1, -1, indices < 0) - indices;
+#else /* defined(USE_INVERTED_AXIS) */
+ indices = select(indices, indices + RANK, indices < 0);
+#endif /* defined(USE_INVERTED_AXIS) */
+
#if NUM_REVERSE_DIMS == 1
- const uint index = *((__global uint *)axis.ptr);
- to_reverse = (uint4)index == dims;
+ to_reverse = ((uint4)indices == dims);
#elif NUM_REVERSE_DIMS == 2
- const uint2 indices = vload2(0, (__global uint *)axis.ptr);
- to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
#elif NUM_REVERSE_DIMS == 3
- const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
- const uint index2 = *((__global uint *)axis.ptr + 2);
- to_reverse = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
-#else /* NUM_REVERSE_DIMS == 3 */
- const uint4 indices = vload4(0, (__global uint *)axis.ptr);
- to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims);
+#else /* NUM_REVERSE_DIMS == 1 */
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
#endif /* NUM_REVERSE_DIMS == 1 */
+
const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
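The loaded axis values are normalized before being compared against dims: negative axes wrap into [0, RANK), and under USE_INVERTED_AXIS an axis a maps to RANK - 1 - a. A small C sketch of both mappings, assuming a hypothetical rank of 4:

#include <stdio.h>

/* select(indices, indices + RANK, indices < 0) */
static int wrap_axis(int axis, int rank)
{
    return axis < 0 ? axis + rank : axis;
}

/* select((int)RANK - 1, -1, indices < 0) - indices */
static int invert_axis(int axis, int rank)
{
    return (axis < 0 ? -1 : rank - 1) - axis;
}

int main(void)
{
    const int rank = 4;
    const int axes[] = { 0, 2, -1, -4 };

    for (int i = 0; i < 4; ++i)
        printf("axis %2d -> wrapped %d, inverted %d\n",
               axes[i], wrap_axis(axes[i], rank), invert_axis(axes[i], rank));
    return 0;
}

Both branches land in the valid [0, RANK) range, which is what the @note above about axis_tensor values guarantees.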
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl
index e0b98e68c9..8cfe5ddcb6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -173,7 +173,7 @@ __kernel void roi_align_layer(
const float2 roi_bin_grid = SAMPLING_RATIO;
#else // !defined(SAMPLING_RATIO)
// Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
- const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+ const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
#endif // defined(SAMPLING_RATIO)
// Move input and output pointer across the fourth dimension
@@ -184,7 +184,7 @@ __kernel void roi_align_layer(
#if defined(NHWC)
__global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
#else // !defined(NHWC)
- __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
#endif // defined(NHWC)
*_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
region_start.x,
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
index d5c9a0d9bf..e75dee06f6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
index ac193e8fb6..6899b952e0 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "helpers.h"
+#include "helpers_asymm.h"
#if DATA_SIZE == 32
#define VEC_SIZE 4
@@ -29,24 +30,41 @@
#elif DATA_SIZE == 16
#define VEC_SIZE 8
#define VEC_MAX vec8_max
-#else /* DATA_SIZE not equals 32 or 16 */
+#elif DATA_SIZE == 8
+#define VEC_SIZE 16
+#define VEC_MAX vec16_max
+#else /* DATA_SIZE not equal to 8, 16 or 32 */
#error "Unsupported data size"
#endif /* DATA_SIZE == 32 */
+// Define whether to use max (quantized data types) or fmax (float) functions
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+#define MAX(x, y) max(x, y)
+#else // !(defined(OFFSET_OUT) && defined(SCALE_OUT))
+#define MAX(x, y) fmax(x, y)
+#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
+
inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec)
{
VEC_DATA_TYPE(DATA_TYPE, 2)
- temp = fmax(vec.lo, vec.hi);
- return fmax(temp.x, temp.y);
+ temp = MAX(vec.lo, vec.hi);
+ return MAX(temp.x, temp.y);
}
inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec)
{
VEC_DATA_TYPE(DATA_TYPE, 4)
- temp = fmax(vec.lo, vec.hi);
+ temp = MAX(vec.lo, vec.hi);
return vec4_max(temp);
}
+inline DATA_TYPE vec16_max(VEC_DATA_TYPE(DATA_TYPE, 16) vec)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp = MAX(vec.lo, vec.hi);
+ return vec8_max(temp);
+}
+
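The vec4/vec8/vec16 helpers form a halving reduction tree: each step folds the high half of the vector onto the low half until a single lane remains. The same fold in scalar C, with a plain array standing in for the OpenCL vector:

#include <stdio.h>

/* Folds the high half onto the low half until one lane remains,
   like vec16_max -> vec8_max -> vec4_max -> scalar. */
static float fold_max(float *v, int n)
{
    while (n > 1)
    {
        n /= 2;
        for (int i = 0; i < n; ++i) /* temp = MAX(vec.lo, vec.hi) */
            v[i] = v[i] > v[i + n] ? v[i] : v[i + n];
    }
    return v[0];
}

int main(void)
{
    float v[16] = { 3, 7, 1, 9, 4, 2, 8, 5, 0, 6, 11, 2, 5, 1, 7, 4 };
    printf("max = %g\n", fold_max(v, 16)); /* prints max = 11 */
    return 0;
}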
/** Performs a roi pooling on a single output pixel.
*
* @param[in] input Pointer to input Tensor3D struct.
@@ -69,7 +87,8 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
{
int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(-FLT_MAX);
+ curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(MIN_VALUE);
+
for(int j = region_start_y; j < region_end_y; ++j)
{
int i = region_start_x;
@@ -77,27 +96,34 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
{
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz));
- curr_max = fmax(val, curr_max);
+ curr_max = MAX(val, curr_max);
}
for(; i < region_end_x; ++i)
{
DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz);
- curr_max = fmax(curr_max, val);
+ curr_max = MAX(curr_max, val);
}
}
- return (DATA_TYPE)VEC_MAX(curr_max);
+
+ const DATA_TYPE temp = (DATA_TYPE)VEC_MAX(curr_max);
+
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+ return QUANTIZE(temp, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+#endif /* if quantized, requantize and return */
+
+ return temp;
}
}
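When OFFSET_OUT and SCALE_OUT are defined, the pooled maximum is requantized with QUANTIZE, which maps a value into the asymmetric quantized output space as round(x / scale) + offset with saturation. A standalone C sketch of that mapping, using a hypothetical scale and offset:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of QUANTIZE for a QASYMM8 output: round(x / scale) + offset,
   saturated to the uchar range. */
static uint8_t quantize(float x, float scale, int offset)
{
    int q = (int)lrintf(x / scale) + offset;
    if (q < 0)   q = 0;
    if (q > 255) q = 255;
    return (uint8_t)q;
}

int main(void)
{
    const float scale_out  = 0.05f; /* hypothetical SCALE_OUT  */
    const int   offset_out = 10;    /* hypothetical OFFSET_OUT */

    printf("q(1.25) = %u\n", quantize(1.25f, scale_out, offset_out)); /* 35 */
    return 0;
}

Max pooling itself can run directly on the quantized values because max is monotonic, so only the final result needs this mapping.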
/** Performs a roi pooling function.
*
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32, QASYMM8;
* @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
* @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
* @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
* @note Spatial scale must be passed using -DSPATIAL_SCALE;
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32, QASYMM8
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -111,7 +137,7 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
* @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
* @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
* @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as input
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -139,9 +165,9 @@ __kernel void roi_pooling_layer(
// Load roi parameters
// roi is laid out as follows { batch_index, x1, y1, x2, y2 }
- const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
- const VEC_DATA_TYPE(DATA_TYPE, 4)
- roi = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+ const ushort roi_batch = (ushort) * ((__global ushort *)offset(&rois, 0, pw));
+ const VEC_DATA_TYPE(ushort, 4)
+ roi = vload4(0, (__global ushort *)offset(&rois, 1, pw));
const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
diff --git a/src/core/CL/cl_kernels/common/scatter.cl b/src/core/CL/cl_kernels/common/scatter.cl
new file mode 100644
index 0000000000..e3ec9cc98e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/scatter.cl
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// The defines below implement the various reduce operations for our purposes,
+// where a corresponds to the existing value and b to the new value.
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+
+#ifdef IS_FLOAT
+#define MAX_OP(a, b) fmax(a, b)
+#define MIN_OP(a, b) fmin(a, b)
+#else // ifdef IS_FLOAT
+#define MAX_OP(a, b) max(a, b)
+#define MIN_OP(a, b) min(a, b)
+#endif // ifdef IS_FLOAT
+
+#define UPDATE_OP(a, b) (b)
+
+#ifdef SCATTER_MP1D_2D_MPND
+
+/** This kernel performs scatter operation
+ *
+ * @note Datatype should be given as a compile-time argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Number of indices should be given as a compile-time argument using -DNUM_INDICES, e.g. -DNUM_INDICES=3
+ * @note Index length should be given as a compile-time argument using -DINDEX_LENGTH, e.g. -DINDEX_LENGTH=2
+ * @note Outermost output shapes should be given as a compile-time argument using -DOUT_SHAPE_N_MINUS_X, where
+ * X must be 1,2,3,4,5, e.g. -DOUT_SHAPE_N_MINUS_1=3, ...
+ * @note Number of elements to copy in a row should be given as a compile-time argument using -DN0, e.g. -DN0=4
+ * @note Number of partial elements at the edge to copy in a row should be given as a compile-time argument using
+ * -DPARTIAL_N0, e.g. -DPARTIAL_N0=2
+ * @note Scatter function should be given as a compile-time argument using -DSCATTER_FUNCTION, e.g. -DSCATTER_FUNCTION=ADD
+ * @note If the kernel should skip reading the output tensor, the -DSKIP_OUTPUT_READ option should be provided.
+ * @note Kernel name in uppercase letters should be provided as a compile-time argument, e.g. -DSCATTER_MP1D_2D_MPND
+ *
+ * @param[in] updates_ptr Pointer to the updates tensor. Data Types: F32
+ * @param[in] updates_stride_x Stride of the updates tensor in X dimension (in bytes)
+ * @param[in] updates_step_x updates_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] updates_stride_y Stride of the updates tensor in Y dimension (in bytes)
+ * @param[in] updates_step_y updates_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] updates_offset_first_element_in_bytes The offset of the first element in the updates tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Data Types: S32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Same as @p updates_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] upt_block_stride Update tensor data block stride in bytes
+ * @param[in] out_block_stride Output tensor data block stride in bytes
+ */
+__kernel void scatter_mp1d_2d_mpnd(
+ IMAGE_DECLARATION(updates),
+ IMAGE_DECLARATION(indices),
+ IMAGE_DECLARATION(output),
+ int upt_block_stride,
+ int out_block_stride
+ )
+{
+ const int out_shape[5] = {OUT_SHAPE_N_MINUS_1, OUT_SHAPE_N_MINUS_2, OUT_SHAPE_N_MINUS_3,
+ OUT_SHAPE_N_MINUS_4, OUT_SHAPE_N_MINUS_5};
+
+ const int x = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // x-coordinate in the tensor
+ const int y = get_global_id(1); // collapsed y-coordinate (ignoring the outermost dimensions)
+
+ const bool x_cond = (PARTIAL_N0 != 0 && get_global_id(0) == 0);
+
+ uchar *ind_ptr_raw = indices_ptr + indices_offset_first_element_in_bytes;
+ const uchar *out_ptr_raw = output_ptr + output_offset_first_element_in_bytes
+ + x * sizeof(DATA_TYPE) + y * output_stride_y;
+
+ const uchar *upt_ptr_raw = updates_ptr + updates_offset_first_element_in_bytes
+ + x * sizeof(DATA_TYPE) + y * updates_stride_y;
+
+ for(int index_element = 0; index_element < NUM_INDICES; ++index_element)
+ {
+ const int *ind_ptr = (const int *) (ind_ptr_raw);
+
+ // Out of bounds check
+ bool out_of_bounds = false;
+ LOOP_UNROLLING(int, i, 0, 1, INDEX_LENGTH,
+ {
+ if(ind_ptr[i] >= out_shape[i] || ind_ptr[i] < 0)
+ {
+ out_of_bounds = true;
+ }
+ });
+
+ ind_ptr_raw += indices_stride_y;
+
+ if(out_of_bounds)
+ {
+ continue;
+ }
+
+ // Index calculation
+ int index = 0;
+ LOOP_UNROLLING(int, i, 0, 1, INDEX_LENGTH,
+ {
+ index = index * out_shape[i] + ind_ptr[i];
+ });
+
+ DATA_TYPE *out_ptr = (DATA_TYPE *) (out_ptr_raw + index * out_block_stride);
+
+ const DATA_TYPE *upt_ptr = (const DATA_TYPE *) (upt_ptr_raw + index_element * upt_block_stride);
+
+ VEC_DATA_TYPE(DATA_TYPE, N0) data_in0 = VLOAD(N0)(0, (__global DATA_TYPE *) upt_ptr);
+
+#ifdef SKIP_OUTPUT_READ
+ STORE_VECTOR_SELECT(data_in, DATA_TYPE, (__global DATA_TYPE *) out_ptr, N0, PARTIAL_N0, x_cond);
+#else // ifdef SKIP_OUTPUT_READ
+ VEC_DATA_TYPE(DATA_TYPE, N0) data_out0 = VLOAD(N0)(0, (__global DATA_TYPE *) out_ptr);
+ data_out0 = SCATTER_FUNCTION(data_out0, data_in0);
+
+ STORE_VECTOR_SELECT(data_out, DATA_TYPE, (__global DATA_TYPE *) out_ptr, N0, PARTIAL_N0, x_cond);
+#endif // ifdef SKIP_OUTPUT_READ
+ }
+}
+
+#endif // SCATTER_MP1D_2D_MPND
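The per-tuple index computation above flattens an INDEX_LENGTH-long index tuple row-major against the outermost output dimensions, after a bounds check that skips invalid tuples. A host-side C sketch with hypothetical shapes and tuples:

#include <stdio.h>

int main(void)
{
    enum { INDEX_LENGTH = 3 };
    const int out_shape[INDEX_LENGTH] = { 4, 5, 6 }; /* outermost dims, N-1 first */
    const int tuples[3][INDEX_LENGTH] = { { 1, 2, 3 }, { 3, 4, 5 }, { 4, 0, 0 } };

    for (int t = 0; t < 3; ++t)
    {
        int out_of_bounds = 0;
        int index = 0;

        for (int i = 0; i < INDEX_LENGTH; ++i)
        {
            if (tuples[t][i] < 0 || tuples[t][i] >= out_shape[i])
                out_of_bounds = 1;
            index = index * out_shape[i] + tuples[t][i]; /* row-major flatten */
        }

        if (out_of_bounds)
            printf("tuple %d skipped (out of bounds)\n", t);
        else
            printf("tuple %d -> data block %d\n", t, index);
    }
    return 0;
}

The resulting index selects which out_block_stride-sized block of the output receives the corresponding row of updates.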
+
+#ifdef SCATTER1D_PARALLEL
+
+// NOTE: This code is non-deterministic and can only be executed with the "update" ScatterFunction
+// This code is currently unused as it requires changes to the existing test suite.
+/** Performs the Scatter1D operation with multiple threads.
+ * Similar to @ref scatter1D()
+ */
+__kernel void scatter1D_parallel(
+ TENSOR4D_DECLARATION(updates),
+ TENSOR4D_DECLARATION(indices),
+ TENSOR4D_DECLARATION(output))
+{
+ // Currently 1D - only iterate through x dimension of indices.
+ const int px = get_global_id(0);
+    const int index_value = *(__global int *)(indices_ptr + indices_offset_first_element_in_bytes + (sizeof(int) * px));
+
+ if(index_value < OUT_SHAPE_X)
+ {
+        const DATA_TYPE update = *(__global DATA_TYPE *)(updates_ptr + updates_offset_first_element_in_bytes + (sizeof(DATA_TYPE) * px));
+        __global uchar *out_addr = output_ptr + output_offset_first_element_in_bytes + (sizeof(DATA_TYPE) * index_value);
+ *(__global DATA_TYPE *)(out_addr) = update;
+ }
+}
+
+#endif // SCATTER1D_PARALLEL
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/common/select.cl
index b06a1118a8..6fd4bd4ce3 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/common/select.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
/** This function performs a select operation between two tensors when the condition tensor has the same rank.
 *
 * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE
*
* @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
* @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -68,29 +69,34 @@ __kernel void select_same_rank(
TENSOR3D_DECLARATION(y),
TENSOR3D_DECLARATION(out))
{
- // Get pixels pointer
- Tensor3D c_t = CONVERT_TO_TENSOR3D_STRUCT(c);
- Tensor3D x_t = CONVERT_TO_TENSOR3D_STRUCT(x);
- Tensor3D y_t = CONVERT_TO_TENSOR3D_STRUCT(y);
- Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+ // Get pointers
+ uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes + offset + get_global_id(1) * c_step_y + get_global_id(2) * c_step_z;
+ __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z;
+ __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
// Load values
SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ in_c = CONVERT(VLOAD(VEC_SIZE)(0, c_addr), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+ in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+ in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr);
- // Calculate and store result
- VSTORE(VEC_SIZE)
- (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
+ // Calculate result
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
/** This function performs a select operation between two tensors when the condition tensor has a different rank.
 *
 * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE
*
* @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
* @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -129,31 +135,36 @@ __kernel void select_different_rank_2(
{
const int c_idx = get_global_id(1);
- // Get pixels pointer
- Vector c_t = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
- Tensor3D x_t = CONVERT_TO_TENSOR3D_STRUCT(x);
- Tensor3D y_t = CONVERT_TO_TENSOR3D_STRUCT(y);
- Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+ // Get pointers
+ uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes;
+ __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z;
+ __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
// Load values
SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+ in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+ in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr);
+
+ // Calculate result
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+ res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
- // Calculate and store result
- VSTORE(VEC_SIZE)
- (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) */
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER)
/** This function performs a select operation between two tensors when the condition tensor has a different rank.
 *
 * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE
*
* @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
* @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -192,22 +203,26 @@ __kernel void select_different_rank_n(
{
const int c_idx = get_global_id(2) / DEPTH_SIZE;
- // Get pixels pointer
- Vector c_t = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
- Tensor3D x_t = CONVERT_TO_TENSOR3D_STRUCT(x);
- Tensor3D y_t = CONVERT_TO_TENSOR3D_STRUCT(y);
- Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+ // Get pointers
+ uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes;
+ __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z;
+ __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z;
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
// Load values
SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+ in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+ in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr);
+
+ // Calculate result
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+ res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
- // Calculate and store result
- VSTORE(VEC_SIZE)
- (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
+ // Boundary-aware store
+ STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */ \ No newline at end of file
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER) */ \ No newline at end of file
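The three select variants differ only in how the condition value is looked up: same-rank indexes c like x and y, rank-2 broadcasts one condition byte per row, and rank-n broadcasts one byte per x/y/depth block via c_idx = get_global_id(2) / DEPTH_SIZE. A compact C sketch of the rank-n broadcast, with hypothetical sizes:

#include <stdio.h>

int main(void)
{
    enum { DEPTH_SIZE = 3, BATCHES = 2 };
    const unsigned char c[BATCHES] = { 1, 0 }; /* one condition byte per batch */
    const float x_in = 1.0f, y_in = 2.0f;

    for (int gid2 = 0; gid2 < BATCHES * DEPTH_SIZE; ++gid2)
    {
        int c_idx = gid2 / DEPTH_SIZE; /* c_idx = get_global_id(2) / DEPTH_SIZE */
        float out = c[c_idx] > 0 ? x_in : y_in;
        printf("z-slice %d: out = %g (from %s)\n",
               gid2, out, c[c_idx] > 0 ? "x" : "y");
    }
    return 0;
}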
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl
index dc3ffd91c1..189d414aba 100644
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ b/src/core/CL/cl_kernels/common/slice_ops.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
* @attention Supported tensor rank: up to 4
*
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input and output tensor dephts should be given as a preprocessor arguments using -DSRC_DEPTH=size. and -DDST_DEPTH=size
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDST_DEPTH=size
* @attention Absolute start coordinates for each dimension should be given as preprocessor -DSTART_index=value e.g. -DSTART_0=2
* @attention Strides for each dimension should be given as preprocessor -DSTRIDE_index=value e.g. -DSTRIDE_1=1
*
@@ -58,7 +58,7 @@ __kernel void strided_slice(
TENSOR4D_DECLARATION(output))
{
// Get pixels pointer
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
int offset = 0;
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
new file mode 100644
index 0000000000..bfc0995bb8
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#define MIN_VALUE_float -FLT_MAX
+#define MIN_VALUE_half -HALF_MAX
+#define MIN_VALUE_char CHAR_MIN
+#define MIN_VALUE_uchar 0
+
+#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type
+#define MIN_VALUE_TYPE(data_type) MIN_VALUE_TYPE_STR(data_type)
+#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE)
+
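The MIN_VALUE macros above select a per-type sentinel through two levels of expansion; the extra MIN_VALUE_TYPE_STR level forces DATA_TYPE to expand before token pasting. A small C program demonstrating the indirection (float and char cases only):

#include <float.h>
#include <limits.h>
#include <stdio.h>

#define MIN_VALUE_float -FLT_MAX
#define MIN_VALUE_char  CHAR_MIN

#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type
#define MIN_VALUE_TYPE(data_type)     MIN_VALUE_TYPE_STR(data_type)

#define DATA_TYPE float
#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE)

int main(void)
{
    /* MIN_VALUE -> MIN_VALUE_TYPE(float) -> MIN_VALUE_float -> -FLT_MAX.
     * Without the extra MIN_VALUE_TYPE_STR level, ## would paste before
     * DATA_TYPE expands, producing the undefined MIN_VALUE_DATA_TYPE. */
    printf("MIN_VALUE = %g\n", (double)MIN_VALUE);
    return 0;
}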
+#ifdef SOFTMAX_X
+
+/** 3-pass softmax in the x dimension.
+ *
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ * - VEC_SIZE: the size of the vector.
+ *
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
+ */
+__kernel void softmax_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element
+
+#ifdef IS_QUANTIZED
+ ,
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element
+#endif // IS_QUANTIZED
+)
+{
+ const int dim_0 = get_global_id(0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
+
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+
+#ifdef IS_QUANTIZED
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+#else // IS_QUANTIZED
+ __global uchar *tmp_ptr = dst_ptr;
+#endif // IS_QUANTIZED
+
+ // Calculate max value.
+ DATA_TYPE max_value = MIN_VALUE;
+ int i = 0;
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)));
+
+ max_value = max(max_value, MAX_REDUCE(data, VEC_SIZE));
+ }
+
+ for (; i < LENGTH; ++i)
+ {
+ DATA_TYPE data = *(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE));
+
+ max_value = max(max_value, data);
+ }
+
+ // Regularize the data.
+ TMP_DATA_TYPE sum_value = 0;
+
+#ifdef IS_QUANTIZED
+ TMP_DATA_TYPE max_value_f = (CONVERT(max_value, TMP_DATA_TYPE) - SRC_OFFSET) * SRC_SCALE;
+ TMP_DATA_TYPE regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += SUM_REDUCE(exp(data), VEC_SIZE);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += SUM_REDUCE(data, VEC_SIZE);
+#endif // IS_LOG
+
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+ }
+
+ for (; i < LENGTH; ++i)
+ {
+ TMP_DATA_TYPE data = CONVERT(*(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)), TMP_DATA_TYPE);
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += data;
+#endif // IS_LOG
+
+ *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)) = data;
+ }
+
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte)
+# else // IS_LOG
+ TMP_DATA_TYPE norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(SIZE, x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(SIZE, x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(SIZE, x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result = NORMALIZE(VEC_SIZE, data);
+
+ VSTORE(VEC_SIZE)(result, 0, (__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)));
+ }
+
+ for (; i < LENGTH; ++i)
+ {
+ TMP_DATA_TYPE data = *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE));
+
+ DATA_TYPE result = NORMALIZE(1, data);
+
+ *(__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)) = result;
+ }
+
+#undef NORMALIZE
+}
+
+#endif // SOFTMAX_X
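Stripped of vectorization, quantization and the leftover handling, the kernel above is the classic numerically stable three-pass softmax. A scalar C sketch of the non-quantized path, with hypothetical length and beta:

#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    enum { LENGTH = 4 };     /* hypothetical softmax axis length */
    const float beta = 1.0f; /* hypothetical BETA                */
    float x[LENGTH] = { 1.0f, 2.0f, 3.0f, 4.0f }, tmp[LENGTH];

    /* Pass 1: find the max (seeded with the MIN_VALUE sentinel). */
    float max_value = -FLT_MAX;
    for (int i = 0; i < LENGTH; ++i)
        max_value = fmaxf(max_value, x[i]);

    /* Pass 2: regularize, exponentiate and accumulate the sum. */
    float sum_value = 0.0f;
    for (int i = 0; i < LENGTH; ++i)
    {
        tmp[i] = expf((x[i] - max_value) * beta);
        sum_value += tmp[i];
    }

    /* Pass 3: normalize (log-softmax would subtract log(sum) instead). */
    for (int i = 0; i < LENGTH; ++i)
        printf("softmax[%d] = %f\n", i, tmp[i] / sum_value);
    return 0;
}

Subtracting the max before exponentiating keeps every exp argument non-positive, which is what makes the three-pass formulation safe in FP16.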
+
+#ifdef SOFTMAX_NON_X
+
+/** 3-pass softmax in any dimension higher than the x dimension.
+ *
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ * - VEC_SIZE: the size of the vector.
+ * - VEC_SIZE_LEFTOVER: the size of the leftover part.
+ *
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
+ */
+__kernel void softmax_non_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element,
+
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element,
+
+ uint src_stride_axis,
+ uint dst_stride_axis
+)
+{
+ const int dim_0 = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
+
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+
+ // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+ //
+ // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+ // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
+    // to the second pass to reduce the memory bandwidth of the first pass.
+ //
+ // In the second pass, it reads the quantized data from the temporary tensor and writes the dequantized data
+ // back to the temporary tensor.
+ //
+    // To avoid dequantized data overwriting the unprocessed quantized data in the temporary tensor,
+ // this extra offset is introduced to store the quantized data at the end of the temporary tensor.
+ //
+    // Note: Another approach is to perform the second pass in reverse order, but for unexplained reasons
+    // it doesn't work on some devices.
+ uint tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(TMP_DATA_TYPE) - sizeof(DATA_TYPE));
+
+ // Calculate max value and store the input data to the temporary tensor in suitable format.
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
+ int i = 0;
+
+ for (i = 0; i < LENGTH; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * src_stride_axis));
+
+ max_value = max(max_value, data);
+
+ VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE)));
+ }
+
+ // Regularize the data.
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) sum_value = 0;
+
+#ifdef IS_QUANTIZED
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) max_value_f = (CONVERT(max_value, VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)) - SRC_OFFSET) * SRC_SCALE;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH; ++i)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += data;
+#endif // IS_LOG
+
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
+ }
+
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte)
+# else // IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, VEC_SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH; ++i)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result0 = NORMALIZE(data);
+
+ STORE_VECTOR_SELECT(result, DATA_TYPE, dst_ptr + i * dst_stride_axis, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+ }
+
+#undef NORMALIZE
+}
+
+#endif // SOFTMAX_NON_X
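The tmp_extra_offset trick above stages the narrow quantized copy at the tail of the temporary buffer so that the widening in-place conversion, written from the head, never destroys input that has not been read yet. A host-side C sketch of the same layout, assuming VEC_SIZE = 1 and uchar-to-float widening:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    enum { LENGTH = 8 };
    unsigned char buf[LENGTH * sizeof(float)]; /* the temporary tensor */
    const size_t extra = LENGTH * (sizeof(float) - sizeof(uint8_t)); /* tmp_extra_offset */

    /* Pass 1: stage the quantized bytes at the tail of the buffer. */
    for (int i = 0; i < LENGTH; ++i)
        buf[extra + i] = (uint8_t)(10 * i);

    /* Pass 2: widen in place from the head; the staged bytes sit far
       enough ahead that the float writes never overwrite a quantized
       byte that still has to be read. */
    for (int i = 0; i < LENGTH; ++i)
    {
        float f = (float)buf[extra + i]; /* dequantization would happen here */
        memcpy(buf + i * sizeof(float), &f, sizeof(float));
    }

    float out;
    memcpy(&out, buf + 3 * sizeof(float), sizeof(float));
    printf("element 3 = %g\n", out); /* prints element 3 = 30 */
    return 0;
}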
+
+#undef MIN_VALUE
+#undef MIN_VALUE_TYPE
+#undef MIN_VALUE_TYPE_STR
+
+#undef MIN_VALUE_float
+#undef MIN_VALUE_half
+#undef MIN_VALUE_char
+#undef MIN_VALUE_uchar
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl
index 438e858df2..2468bf750d 100644
--- a/src/core/CL/cl_kernels/stack_layer.cl
+++ b/src/core/CL/cl_kernels/common/stack_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
index 79da7fe6b9..4d8f802ea1 100644
--- a/src/core/CL/cl_kernels/tile.cl
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,8 +50,8 @@ __kernel void tile(
TENSOR4D_DECLARATION(input),
TENSOR4D_DECLARATION(output))
{
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
// For all coordinates but x, each tile copies from the input
const int y = get_global_id(1);
@@ -62,22 +62,18 @@ __kernel void tile(
    // If we are loading/storing multiple elements at a time, we need to
    // avoid exceeding the input boundaries. The last threads need to backtrack
    // by OFFSET elements, and those offsets accumulate across previous tiles
- const int id = (int)(get_global_id(0));
- int x = id * VEC_SIZE;
- // Shift x based on the previous offsets
- const int tile_number = x / SRC_WIDTH;
- x -= (tile_number) * OFFSET;
- int x_input = x % SRC_WIDTH;
+ const int id = (int)(get_global_id(0));
+ const int multiple_no = id / SRC_WIDTH_TILES;
+ const int tile_no = id % SRC_WIDTH_TILES;
+ const int last_tile = (int)(tile_no == SRC_WIDTH_TILES - 1);
- // Shift x based on being the last tile
- const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
- x -= last_tile * OFFSET;
- x_input = x % SRC_WIDTH;
- output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
+ const int x_input = tile_no * VEC_SIZE - last_tile * OFFSET;
+ const int x_output = multiple_no * SRC_WIDTH + x_input;
- // Update the input pointer
- input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+ // Update the input and output pointers.
+ input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+ output.ptr = tensor4D_offset(&output, x_output, y, z, batch);
// Copy the data
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -88,8 +84,9 @@ __kernel void tile(
#else // !defined(VEC_SIZE) || !defined(OFFSET)
const int x = get_global_id(0);
- // Update the input pointer
- input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+ // Update the input and output pointers.
+ input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+ output.ptr = tensor4D_offset(&output, x, y, z, batch);
*((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
#endif // defined(VEC_SIZE) && defined(OFFSET)
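The new tile indexing splits each work-item id into a repetition of the source width (multiple_no) and a vector-sized tile inside it (tile_no); only the last tile of each repetition backtracks by OFFSET so the vector load stays inside the source row. A C sketch, assuming SRC_WIDTH_TILES = ceil(SRC_WIDTH / VEC_SIZE) and OFFSET equal to the last tile's overshoot:

#include <stdio.h>

int main(void)
{
    enum { SRC_WIDTH = 10, VEC_SIZE = 4, MULTIPLES = 2 };
    const int src_width_tiles = (SRC_WIDTH + VEC_SIZE - 1) / VEC_SIZE; /* 3 */
    const int offset = src_width_tiles * VEC_SIZE - SRC_WIDTH;         /* 2 */

    for (int id = 0; id < MULTIPLES * src_width_tiles; ++id)
    {
        int multiple_no = id / src_width_tiles; /* which copy of the source row */
        int tile_no     = id % src_width_tiles; /* which vector tile inside it  */
        int last_tile   = (tile_no == src_width_tiles - 1);

        /* The last tile backtracks by OFFSET to stay inside the source row. */
        int x_input  = tile_no * VEC_SIZE - last_tile * offset;
        int x_output = multiple_no * SRC_WIDTH + x_input;

        printf("id %d: loads input x=[%d,%d), writes output at x=%d\n",
               id, x_input, x_input + VEC_SIZE, x_output);
    }
    return 0;
}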
diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl
new file mode 100644
index 0000000000..5b4c68ca10
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/transpose.cl
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X
+#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y
+
+#include "helpers.h"
+#include "repeat.h"
+
+#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
+
+#if VEC_SIZE_X == 1
+#if VEC_SIZE_Y == 1
+#define TRANSPOSED_U(val) \
+ { \
+ u0 \
+ }
+#elif VEC_SIZE_Y == 2
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1 \
+ }
+#elif VEC_SIZE_Y == 3
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2 \
+ }
+#elif VEC_SIZE_Y == 4
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2, u3 \
+ }
+#elif VEC_SIZE_Y == 8
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2, u3, u4, u5, u6, u7 \
+ }
+#elif VEC_SIZE_Y == 16
+#define TRANSPOSED_U(val) \
+ { \
+ u0, u1, u2, u3, u4, u5, u6, u7, \
+ u8, u9, u10, u11, u12, u13, u14, u15 \
+ }
+#endif /* switch VEC_SIZE_Y */
+#else // VEC_SIZE_X == 1
+#if VEC_SIZE_Y == 1
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val \
+ }
+#elif VEC_SIZE_Y == 2
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val \
+ }
+#elif VEC_SIZE_Y == 3
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val \
+ }
+#elif VEC_SIZE_Y == 4
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val, u3.val \
+ }
+#elif VEC_SIZE_Y == 8
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \
+ }
+#elif VEC_SIZE_Y == 16
+#define TRANSPOSED_U(val) \
+ { \
+ u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \
+ u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \
+ }
+#endif /* switch VEC_SIZE_Y */
+#endif // VEC_SIZE_X == 1
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#else /* switch DATA_TYPE_IN_BYTES */
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif /* switch DATA_TYPE_IN_BYTES */
+
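Since the transpose never interprets element values, DATA_TYPE_IN_BYTES erases the real type into an unsigned container of the same width; U16, S16 and FP16 all move through ushort. A minimal C sketch of the idea on a hypothetical 2x3 matrix:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    enum { ROWS = 2, COLS = 3 };
    /* uint16_t stands in for any 2-byte element type (U16/S16/FP16). */
    uint16_t src[ROWS][COLS] = { { 1, 2, 3 }, { 4, 5, 6 } };
    uint16_t dst[COLS][ROWS];

    for (int y = 0; y < ROWS; ++y)     /* source row    -> destination column */
        for (int x = 0; x < COLS; ++x) /* source column -> destination row    */
            dst[x][y] = src[y][x];     /* raw 2-byte move, value-agnostic     */

    for (int y = 0; y < COLS; ++y)
        printf("%u %u\n", (unsigned)dst[y][0], (unsigned)dst[y][1]);
    return 0;
}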
+/** This OpenCL kernel computes the matrix transposition of input matrix
+ *
+ * @note The size in bytes of the data type needs to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ * @note The vector sizes and leftovers need to be passed at compile time as well:
+ * -# -DVEC_SIZE_X is the number of elements processed in the X dimension
+ * -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X
+ * -# -DVEC_SIZE_Y is the number of elements processed in the Y dimension
+ * -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination matrix in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+ uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0);
+ uint z_offs = get_global_id(2);
+
+ // Compute addresses
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y + z_offs * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y + z_offs * dst_stride_z;
+
+ // Load the NxM block at (x, y)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr);
+#if VEC_SIZE_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y));
+#endif /* VEC_SIZE_Y > 1 */
+#if VEC_SIZE_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif /* VEC_SIZE_Y > 2 */
+#if VEC_SIZE_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif /* VEC_SIZE_Y > 3 */
+#if VEC_SIZE_Y > 4
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif /* VEC_SIZE_Y > 4 */
+#if VEC_SIZE_Y > 8
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+ u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y));
+#endif /* VEC_SIZE_Y > 8 */
+
+    // Create transposed vectors
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t0 = TRANSPOSED_U(s0);
+#if VEC_SIZE_X > 1
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t1 = TRANSPOSED_U(s1);
+#endif /* VEC_SIZE_X > 1 */
+#if VEC_SIZE_X > 2
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t2 = TRANSPOSED_U(s2);
+#endif /* VEC_SIZE_X > 2 */
+#if VEC_SIZE_X > 3
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t3 = TRANSPOSED_U(s3);
+#endif /* VEC_SIZE_X > 3 */
+#if VEC_SIZE_X > 4
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t4 = TRANSPOSED_U(s4);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t5 = TRANSPOSED_U(s5);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t6 = TRANSPOSED_U(s6);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t7 = TRANSPOSED_U(s7);
+#endif /* VEC_SIZE_X > 4 */
+#if VEC_SIZE_X > 8
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t8 = TRANSPOSED_U(s8);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ t9 = TRANSPOSED_U(s9);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tA = TRANSPOSED_U(sA);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tB = TRANSPOSED_U(sB);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tC = TRANSPOSED_U(sC);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tD = TRANSPOSED_U(sD);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tE = TRANSPOSED_U(sE);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+ tF = TRANSPOSED_U(sF);
+#endif /* VEC_SIZE_X > 8 */
+
+ // Store the block at (y, x)
+    REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); // uint zout0 = 0, zout1 = 0, ..., zout{VEC_SIZE_X - 1} = 0;
+ STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0
+ && get_global_id(0) == 0,
+ VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0);
+}
+
+#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
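
The offset clamp at the top of this kernel is what makes the tiling boundary-aware: when a dimension is not a multiple of the vector size, work-item 0 processes the partial leading block and every later work-item is shifted back, so each VLOAD stays in bounds and only work-item 0 takes the partial-store path of STORE_BLOCK_BOUNDARY_AWARE. A stand-alone C sketch of that index arithmetic, under assumed sizes (VEC_SIZE_X = 4 and a 10-element X dimension; the variable names are illustrative, not part of the kernel interface):

#include <stdio.h>

int main(void)
{
    const int vec_size_x = 4; /* -DVEC_SIZE_X=4 (assumed)                */
    const int leftover_x = 2; /* 10 % 4, i.e. -DVEC_SIZE_LEFTOVER_X=2    */

    for (int gid = 0; gid < 3; ++gid)
    {
        /* Mirrors: max(gid * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X, 0) */
        int x_offs = gid * vec_size_x - (vec_size_x - leftover_x) % vec_size_x;
        if (x_offs < 0)
        {
            x_offs = 0;
        }
        /* Prints 0, 2, 6: work-item 0 stores only the 2 leftover elements,
         * the others store full vectors starting at 2 and 6 */
        printf("gid=%d x_offs=%d\n", gid, x_offs);
    }
    return 0;
}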
diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl
index 457e9bf8f1..6662dc9360 100644
--- a/src/core/CL/cl_kernels/unpooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl
deleted file mode 100644
index 408846144d..0000000000
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define EQUAL(x, y) ((x) == (y))
-#define NOTEQUAL(x, y) ((x) != (y))
-#define GREATER(x, y) ((x) > (y))
-#define GREATEREQUAL(x, y) ((x) >= (y))
-#define LESS(x, y) ((x) < (y))
-#define LESSEQUAL(x, y) ((x) <= (y))
-
-#define DEFINE_KERNEL_STR(name) compare_##name
-#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
-
-#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized
-#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name)
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
-/** This function compares two tensors.
- *
- * @attention The inputs' data type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation, along with its name using -DOP_NAME=name. e.g. -DOP=LESS -DOP_NAME=less
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types.
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void DEFINE_KERNEL(OP_NAME)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr);
-
- // Calculate and store result
- VSTORE(VEC_SIZE)
- (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr);
-}
-#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
-
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2)
-/** This function compares two quantized tensors.
- *
- * @note The inputs' data type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
- * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types.
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16);
- int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16);
-
- in_a = in_a - (int16)((int)OFFSET_IN1);
- in_b = in_b - (int16)((int)OFFSET_IN2);
-
- const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
- const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
- const int16 res = OP(in1f32, in2f32);
-
- // Store result
- vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr);
-}
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */
\ No newline at end of file
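
For reference, the quantized comparison removed here maps both operands into the real domain before applying OP; a true OpenCL vector comparison yields -1 (all bits set), which the final convert_uchar16 turns into 255. A scalar C sketch of the same arithmetic with made-up quantization parameters (the names below are illustrative, not the kernel's interface):

#include <stdio.h>

int main(void)
{
    /* Hypothetical values for -DOFFSET_IN1/-DOFFSET_IN2 and -DSCALE_IN1/-DSCALE_IN2 */
    const int   offset_in1 = 10;
    const int   offset_in2 = 5;
    const float scale_in1  = 0.5f;
    const float scale_in2  = 0.25f;

    unsigned char in_a = 30, in_b = 50; /* raw quantized inputs */

    /* Subtract the offset, then multiply by the scale, as the kernel does */
    float a = (float)((int)in_a - offset_in1) * scale_in1; /* 10.0  */
    float b = (float)((int)in_b - offset_in2) * scale_in2; /* 11.25 */

    /* OP = LESS: true -> 255 in the U8 output (an all-ones mask cast to uchar) */
    unsigned char res = (a < b) ? 255 : 0;
    printf("%u\n", res); /* prints 255 */
    return 0;
}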
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
deleted file mode 100644
index 7bca567b11..0000000000
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 3 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
- const short left_coeff,
- const short middle_coeff,
- const short right_coeff)
-{
- uchar16 temp = vload16(0, left_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
-
- return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
-}
-
-/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src A pointer to source Image structure
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- * @return a short8 containing 8 convolved and scaled values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
- Image *src,
- const short mat0, const short mat1, const short mat2,
- const short mat3, const short mat4, const short mat5,
- const short mat6, const short mat7, const short mat8, uint scale)
-{
- // Output pixels
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels;
-
- // Row 0
- pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
-    // Row 1
- pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
- // Row 2
- pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
-
- // Divide by the scale
- return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=1, -DMAT1=2, ... -DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution3x3_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels = convolution3x3(&src,
- MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
-
- // Store the result as is in dst
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
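
The convolution1x3 helper above issues a single 16-byte vload and carves the three shifted 8-pixel windows out of it with swizzles, so three taps cost one memory transaction. A scalar C equivalent of that access pattern (coefficients and data are arbitrary sample values):

#include <stdio.h>

int main(void)
{
    /* The 16 contiguous source pixels fetched by the single vload16 */
    unsigned char row[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    const short w_left = 1, w_mid = 2, w_right = 1;
    short out[8];

    for (int i = 0; i < 8; ++i)
    {
        /* row[i], row[i + 1] and row[i + 2] correspond to the swizzles
         * temp.s01234567, temp.s12345678 and temp.s23456789 */
        out[i] = (short)(row[i] * w_left + row[i + 1] * w_mid + row[i + 2] * w_right);
        printf("%d ", out[i]); /* 4 8 12 16 20 24 28 32 */
    }
    printf("\n");
    return 0;
}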
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
deleted file mode 100644
index 9995ebfa90..0000000000
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 5 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] left_pixel Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the left-most pixel
- * @param[in] left2_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the right-most pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x5(
- __global const uchar *left_pixel,
- const short left1_coeff,
- const short left2_coeff,
- const short middle_coeff,
- const short right1_coeff,
- const short right2_coeff)
-{
- uchar16 temp = vload16(0, left_pixel);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
-
- return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
- + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 5 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] up1_coeff    Weight of the top-most pixel
- * @param[in] up2_coeff Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff Weight of the down pixel
- * @param[in] down2_coeff  Weight of the bottom-most pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution5x1(
- Image *src,
- const short up1_coeff,
- const short up2_coeff,
- const short middle_coeff,
- const short down1_coeff,
- const short down2_coeff)
-{
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- val;
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
- return out;
-}
-
-/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [ mat0,  mat1,  mat2,  mat3,  mat4 ]\n
- * [ mat5, mat6, mat7, mat8, mat9 ]\n
- * [ mat10, mat11, mat12, mat13, mat14 ]\n
- * [ mat15, mat16, mat17, mat18, mat19 ]\n
- * [ mat20, mat21, mat22, mat23, mat24 ]
- *
- * @param[in] src A pointer to source Image structure.
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- * @param[in] mat9 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- * @return a short8 containing 8 convolved and scaled values.
- */
-short8 convolution5x5(
- Image *src,
- const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
- const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
- const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
- const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
- const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
- uint scale)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels;
-
- pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
- pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
- pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
- pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
- pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
-
- if(scale > 0)
- {
- pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
- }
-
- return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single channel temporary image (supported data types: U16, S16, S32).
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=1, -DMAT1=2, -DMAT2=3, -DMAT3=4, -DMAT4=5, -DDATA_TYPE=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x5_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Output pixels
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
-
- // Store result in dst
- vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT5=1, -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable5x1_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Output pixels
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
-
- // Divide by the scale
- pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
- // Store result in dst
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including the borders
- *
- * @attention The matrix coefficients (MAT0, MAT1, ... MAT24, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=1, -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution5x5_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- short8 pixels = convolution5x5(&src,
- MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
- MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
-
- // Store the result as is in dst
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
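
The two convolution_separable*_static kernels split a 5x5 convolution whose matrix factors as an outer product of a column and a row vector: the 1x5 pass writes a wider intermediate image (hence the U16/S16/S32 temporary) and the 5x1 pass consumes it and applies SCALE. A minimal C check of that factorisation on one output pixel, with arbitrary sample data:

#include <stdio.h>

int main(void)
{
    /* Separable 5x5 kernel: K[y][x] = col[y] * row[x] */
    const int row[5] = { 1, 2, 3, 2, 1 };
    const int col[5] = { 1, 4, 6, 4, 1 };

    int src[5][5], tmp[5], direct = 0, separable = 0;
    for (int y = 0; y < 5; ++y)
        for (int x = 0; x < 5; ++x)
            src[y][x] = y * 5 + x; /* arbitrary pixel values */

    /* Direct 5x5 convolution at the centre pixel */
    for (int y = 0; y < 5; ++y)
        for (int x = 0; x < 5; ++x)
            direct += src[y][x] * col[y] * row[x];

    /* Horizontal 1x5 pass into the temporary, then vertical 5x1 pass */
    for (int y = 0; y < 5; ++y)
    {
        tmp[y] = 0;
        for (int x = 0; x < 5; ++x)
            tmp[y] += src[y][x] * row[x];
    }
    for (int y = 0; y < 5; ++y)
        separable += tmp[y] * col[y];

    printf("direct=%d separable=%d\n", direct, separable); /* the two agree */
    return 0;
}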
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
deleted file mode 100644
index 50fb3d7f35..0000000000
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 7 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] left_pixel Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the left-most pixel
- * @param[in] left2_coeff Weight of the second left pixel
- * @param[in] left3_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the second right pixel
- * @param[in] right3_coeff Weight of the right-most pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x7(
- __global const uchar *left_pixel,
- const short left1_coeff,
- const short left2_coeff,
- const short left3_coeff,
- const short middle_coeff,
- const short right1_coeff,
- const short right2_coeff,
- const short right3_coeff)
-{
- uchar16 temp = vload16(0, left_pixel);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
-
- return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
- 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
-}
-
-/** Compute a 1D vertical convolution of size 7 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] up1_coeff    Weight of the top-most pixel
- * @param[in] up2_coeff Weight of the second up pixel
- * @param[in] up3_coeff Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff Weight of the down pixel
- * @param[in] down2_coeff Weight of the second down pixel
- * @param[in] down3_coeff Weight of the third down pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution7x1(
- Image *src,
- const short up1_coeff,
- const short up2_coeff,
- const short up3_coeff,
- const short middle_coeff,
- const short down1_coeff,
- const short down2_coeff,
- const short down3_coeff)
-{
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- val;
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
-
- return out;
-}
-
-/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [ mat0,  mat1,  mat2,  mat3,  mat4,  mat5,  mat6 ]\n
- * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n
- * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
- * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
- * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
- * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
- * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
- *
- * @param[in] src A pointer to source Image structure.
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- * @param[in] mat9 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] mat25 Coefficient from the convolution matrix
- * @param[in] mat26 Coefficient from the convolution matrix
- * @param[in] mat27 Coefficient from the convolution matrix
- * @param[in] mat28 Coefficient from the convolution matrix
- * @param[in] mat29 Coefficient from the convolution matrix
- * @param[in] mat30 Coefficient from the convolution matrix
- * @param[in] mat31 Coefficient from the convolution matrix
- * @param[in] mat32 Coefficient from the convolution matrix
- * @param[in] mat33 Coefficient from the convolution matrix
- * @param[in] mat34 Coefficient from the convolution matrix
- * @param[in] mat35 Coefficient from the convolution matrix
- * @param[in] mat36 Coefficient from the convolution matrix
- * @param[in] mat37 Coefficient from the convolution matrix
- * @param[in] mat38 Coefficient from the convolution matrix
- * @param[in] mat39 Coefficient from the convolution matrix
- * @param[in] mat40 Coefficient from the convolution matrix
- * @param[in] mat41 Coefficient from the convolution matrix
- * @param[in] mat42 Coefficient from the convolution matrix
- * @param[in] mat43 Coefficient from the convolution matrix
- * @param[in] mat44 Coefficient from the convolution matrix
- * @param[in] mat45 Coefficient from the convolution matrix
- * @param[in] mat46 Coefficient from the convolution matrix
- * @param[in] mat47 Coefficient from the convolution matrix
- * @param[in] mat48 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- */
-short8 convolution7x7(
- Image *src,
- const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
- const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
- const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
- const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
- const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
- const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
- const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
- const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
- const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
- const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels;
-
- pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
- pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
- pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
- pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
- pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
- pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
- pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
-
- if(scale > 0)
- {
- pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
- }
-
- return convert_short8_sat(pixels);
-}
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single channel temporary image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=1, -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x7_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Output pixels
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
-
- // Store result in dst
- vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT7=1, -DMAT8=2, ... -DMAT13=7, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable7x1_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Output pixels
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
-
- // Divide by the scale
- pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
- // Store result in dst
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders.
- *
- * @attention The matrix coefficients (MAT0, MAT1, ... MAT48, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=7, -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution7x7_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- short8 pixels = convolution7x7(&src,
- MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
- MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
- MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
- MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
-
- // Clamp results to [ 0, 255 ] and store them in dst
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
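
The final store in these static kernels goes through CONVERT_SAT, which saturates the short8 accumulators to the output type instead of letting them wrap. A scalar C model of that clamp for DATA_TYPE_OUT=uchar (an assumed, typical output type):

#include <stdio.h>

/* Scalar equivalent of CONVERT_SAT(short -> uchar) */
static unsigned char convert_uchar_sat(short v)
{
    if (v < 0)
    {
        return 0;
    }
    return (v > 255) ? 255 : (unsigned char)v;
}

int main(void)
{
    short pixels[4] = { -7, 42, 255, 300 }; /* sample accumulator values */
    for (int i = 0; i < 4; ++i)
    {
        printf("%u ", convert_uchar_sat(pixels[i])); /* 0 42 255 255 */
    }
    printf("\n");
    return 0;
}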
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
deleted file mode 100644
index 7e77c61fea..0000000000
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-/** Compute a 1D horizontal convolution of size 9 for 8 bytes, assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] left_pixel Pointer to the left pixel
- * @param[in] left1_coeff  Weight of the left-most pixel
- * @param[in] left2_coeff Weight of the second left pixel
- * @param[in] left3_coeff Weight of the third left pixel
- * @param[in] left4_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right1_coeff Weight of the right pixel
- * @param[in] right2_coeff Weight of the second right pixel
- * @param[in] right3_coeff Weight of the third right pixel
- * @param[in] right4_coeff Weight of the right-most pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-VEC_DATA_TYPE(DATA_TYPE, 8)
-convolution1x9(
- __global const uchar *left_pixel,
- const short left1_coeff,
- const short left2_coeff,
- const short left3_coeff,
- const short left4_coeff,
- const short middle_coeff,
- const short right1_coeff,
- const short right2_coeff,
- const short right3_coeff,
- const short right4_coeff)
-{
- uchar16 temp = vload16(0, left_pixel);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
-
- return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
- 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
- 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
-}
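
The vload16 above is the heart of the routine: a single 16-byte load supplies all nine shifted 8-wide windows needed for eight adjacent outputs, since output i reads inputs i..i+8 and the last output touches byte 15. A plain-C model of the same access pattern (scaffolding and names are illustrative, not library code):

    #include <stdio.h>

    /* Scalar model of convolution1x9: 8 adjacent outputs from one 16-byte
     * window. Output pixel i reads inputs i..i+8; coeff k weighs input i+k. */
    static void conv1x9_window(const unsigned char in[16], const short coeff[9],
                               short out[8])
    {
        for (int i = 0; i < 8; ++i) {
            int acc = 0;
            for (int k = 0; k < 9; ++k)
                acc += (int)in[i + k] * coeff[k];
            out[i] = (short)acc;
        }
    }

    int main(void)
    {
        unsigned char in[16];
        for (int i = 0; i < 16; ++i)
            in[i] = (unsigned char)i;
        const short box[9] = { 1, 1, 1, 1, 1, 1, 1, 1, 1 };
        short out[8];
        conv1x9_window(in, box, out);
        for (int i = 0; i < 8; ++i)
            printf("%d ", out[i]); /* 36 45 54 63 72 81 90 99 */
        printf("\n");
        return 0;
    }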
-
-/** Compute a 1D vertical convolution of size 9 producing 8 output pixels, assuming the input is made of 1 channel of 1 byte per pixel (i.e. 8 pixels per 8 bytes).
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the top-most pixel
- * @param[in] up2_coeff    Weight of the second up pixel
- * @param[in] up3_coeff    Weight of the third up pixel
- * @param[in] up4_coeff    Weight of the fourth up pixel (closest to the middle)
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the first down pixel (closest to the middle)
- * @param[in] down2_coeff  Weight of the second down pixel
- * @param[in] down3_coeff  Weight of the third down pixel
- * @param[in] down4_coeff  Weight of the bottom-most pixel
- *
- * @return a vector of 8 convolved values, of type VEC_DATA_TYPE(COMPUTE_TYPE, 8).
- */
-VEC_DATA_TYPE(COMPUTE_TYPE, 8)
-convolution9x1(
- Image *src,
- const short up1_coeff,
- const short up2_coeff,
- const short up3_coeff,
- const short up4_coeff,
- const short middle_coeff,
- const short down1_coeff,
- const short down2_coeff,
- const short down3_coeff,
- const short down4_coeff)
-{
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- val;
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
-
- val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
- out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
-
- return out;
-}
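
convolution9x1 deliberately accumulates in COMPUTE_TYPE (int by default) rather than in the 16-bit sample type: after the horizontal pass the samples are already 16-bit, and summing nine 16-bit products can overflow a short before the final scaling. A scalar C model of the column accumulation (names illustrative):

    #include <stdio.h>

    /* Model of convolution9x1: one output column, accumulating in int so that
     * nine short-range products cannot overflow before the scale is applied.
     * 'col' points at the middle row, mirroring offset(src, 0, -4..4). */
    static int conv9x1_column(const short *col, int row_stride_elems,
                              const short coeff[9])
    {
        int acc = 0;
        for (int k = -4; k <= 4; ++k)
            acc += (int)col[k * row_stride_elems] * coeff[k + 4];
        return acc;
    }

    int main(void)
    {
        short img[9] = { 100, 200, 300, 400, 500, 600, 700, 800, 900 };
        const short box[9] = { 1, 1, 1, 1, 1, 1, 1, 1, 1 };
        printf("%d\n", conv9x1_column(&img[4], 1, box)); /* 4500 */
        return 0;
    }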
-
-/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
- *
- * Convolution matrix layout:\n
- * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6, mat7, mat8 ]\n
- * [ mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
- * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
- * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
- * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
- * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
- * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]\n
- * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]\n
- * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
- *
- * @param[in] src A pointer to source Image structure.
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- * @param[in] mat9 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
- * @param[in] mat11 Coefficient from the convolution matrix
- * @param[in] mat12 Coefficient from the convolution matrix
- * @param[in] mat13 Coefficient from the convolution matrix
- * @param[in] mat14 Coefficient from the convolution matrix
- * @param[in] mat15 Coefficient from the convolution matrix
- * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat17 Coefficient from the convolution matrix
- * @param[in] mat18 Coefficient from the convolution matrix
- * @param[in] mat19 Coefficient from the convolution matrix
- * @param[in] mat20 Coefficient from the convolution matrix
- * @param[in] mat21 Coefficient from the convolution matrix
- * @param[in] mat22 Coefficient from the convolution matrix
- * @param[in] mat23 Coefficient from the convolution matrix
- * @param[in] mat24 Coefficient from the convolution matrix
- * @param[in] mat25 Coefficient from the convolution matrix
- * @param[in] mat26 Coefficient from the convolution matrix
- * @param[in] mat27 Coefficient from the convolution matrix
- * @param[in] mat28 Coefficient from the convolution matrix
- * @param[in] mat29 Coefficient from the convolution matrix
- * @param[in] mat30 Coefficient from the convolution matrix
- * @param[in] mat31 Coefficient from the convolution matrix
- * @param[in] mat32 Coefficient from the convolution matrix
- * @param[in] mat33 Coefficient from the convolution matrix
- * @param[in] mat34 Coefficient from the convolution matrix
- * @param[in] mat35 Coefficient from the convolution matrix
- * @param[in] mat36 Coefficient from the convolution matrix
- * @param[in] mat37 Coefficient from the convolution matrix
- * @param[in] mat38 Coefficient from the convolution matrix
- * @param[in] mat39 Coefficient from the convolution matrix
- * @param[in] mat40 Coefficient from the convolution matrix
- * @param[in] mat41 Coefficient from the convolution matrix
- * @param[in] mat42 Coefficient from the convolution matrix
- * @param[in] mat43 Coefficient from the convolution matrix
- * @param[in] mat44 Coefficient from the convolution matrix
- * @param[in] mat45 Coefficient from the convolution matrix
- * @param[in] mat46 Coefficient from the convolution matrix
- * @param[in] mat47 Coefficient from the convolution matrix
- * @param[in] mat48 Coefficient from the convolution matrix
- * @param[in] mat49 Coefficient from the convolution matrix
- * @param[in] mat50 Coefficient from the convolution matrix
- * @param[in] mat51 Coefficient from the convolution matrix
- * @param[in] mat52 Coefficient from the convolution matrix
- * @param[in] mat53 Coefficient from the convolution matrix
- * @param[in] mat54 Coefficient from the convolution matrix
- * @param[in] mat55 Coefficient from the convolution matrix
- * @param[in] mat56 Coefficient from the convolution matrix
- * @param[in] mat57 Coefficient from the convolution matrix
- * @param[in] mat58 Coefficient from the convolution matrix
- * @param[in] mat59 Coefficient from the convolution matrix
- * @param[in] mat60 Coefficient from the convolution matrix
- * @param[in] mat61 Coefficient from the convolution matrix
- * @param[in] mat62 Coefficient from the convolution matrix
- * @param[in] mat63 Coefficient from the convolution matrix
- * @param[in] mat64 Coefficient from the convolution matrix
- * @param[in] mat65 Coefficient from the convolution matrix
- * @param[in] mat66 Coefficient from the convolution matrix
- * @param[in] mat67 Coefficient from the convolution matrix
- * @param[in] mat68 Coefficient from the convolution matrix
- * @param[in] mat69 Coefficient from the convolution matrix
- * @param[in] mat70 Coefficient from the convolution matrix
- * @param[in] mat71 Coefficient from the convolution matrix
- * @param[in] mat72 Coefficient from the convolution matrix
- * @param[in] mat73 Coefficient from the convolution matrix
- * @param[in] mat74 Coefficient from the convolution matrix
- * @param[in] mat75 Coefficient from the convolution matrix
- * @param[in] mat76 Coefficient from the convolution matrix
- * @param[in] mat77 Coefficient from the convolution matrix
- * @param[in] mat78 Coefficient from the convolution matrix
- * @param[in] mat79 Coefficient from the convolution matrix
- * @param[in] mat80 Coefficient from the convolution matrix
- * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
- *
- */
-short8 convolution9x9(
- Image *src,
- const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
- const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
- const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
- const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
- const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
- const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
- const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
- const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
- const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
- const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
- const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
- const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
- const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
- const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
- const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
- const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
- const short mat80, uint scale)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels;
-
- pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
- pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
- pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
- pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
- pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
- pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
- pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
- pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
- pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
-
- if(scale > 0)
- {
- pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
- }
-
- return convert_short8_sat(pixels);
-}
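
The scale convention documented above (the sum of the coefficients, or 1 when the sum is zero) keeps smoothing filters range-preserving while leaving zero-sum edge detectors unscaled. Expressed as a small C helper (illustrative, not library code):

    #include <stdio.h>

    /* Scale convention used by the convolution kernels: the sum of the
     * coefficients, or 1 when the sum is 0 (e.g. zero-sum edge detectors). */
    static unsigned int matrix_scale(const short *mat, int n)
    {
        int sum = 0;
        for (int i = 0; i < n; ++i)
            sum += mat[i];
        return (sum == 0) ? 1u : (unsigned int)sum;
    }

    int main(void)
    {
        short box[81];
        for (int i = 0; i < 81; ++i)
            box[i] = 1;
        printf("%u\n", matrix_scale(box, 81)); /* 81: 9x9 box blur */
        return 0;
    }

For the all-ones 9x9 box blur this yields scale = 81, so the division maps the accumulated sum back into the input range before the saturating store.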
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single-channel temporary image.
- *
- * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
- * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DDATA_TYPE=short
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable1x9_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Output pixels
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
-
- // Store result in dst
- vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** Apply a 9x1 static convolution matrix to the single-channel intermediate image and output a single channel image.
- *
- * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=short
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_separable9x1_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Output pixels
- VEC_DATA_TYPE(COMPUTE_TYPE, 8)
- pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
-
- // Divide by the scale
- pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
-
- // Store result in dst
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
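
Together with convolution_separable1x9_static above, this kernel implements a separable 9x9 filter: when the 9x9 matrix is the outer product of a column vector and a row vector (a Gaussian blur, for instance), the horizontal 1x9 pass followed by the vertical 9x1 pass matches the full 2D convolution at 18 instead of 81 multiplies per pixel. The identity, checked in C over one 9x9 window:

    #include <stdio.h>

    #define N 9

    /* Verify: 2D convolution with an outer-product kernel col x row equals a
     * horizontal 1xN pass followed by a vertical Nx1 pass. */
    int main(void)
    {
        long img[N][N];
        long row[N], col[N];
        for (int i = 0; i < N; ++i) {
            row[i] = i + 1;
            col[i] = 2 * i + 1;
            for (int j = 0; j < N; ++j)
                img[i][j] = i * N + j;
        }

        long full = 0; /* full 2D pass: 81 multiplies */
        for (int i = 0; i < N; ++i)
            for (int j = 0; j < N; ++j)
                full += col[i] * row[j] * img[i][j];

        long sep = 0; /* separable: horizontal rows, then one vertical pass */
        for (int i = 0; i < N; ++i) {
            long h = 0;
            for (int j = 0; j < N; ++j)
                h += row[j] * img[i][j];
            sep += col[i] * h;
        }

        printf("full=%ld separable=%ld\n", full, sep); /* identical */
        return 0;
    }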
-
-/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image, including borders.
- *
- * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=short
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution9x9_static(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- short8 pixels = convolution9x9(&src,
- MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
- MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
- MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
- MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
- MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
- MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
- MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
-
-    // Saturate and store the result in dst (already scaled inside convolution9x9)
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
-}
-
-#endif // DYNAMIC_MATRIX_CONVOLUTION
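
All of the *_static kernels receive their coefficients as -D preprocessor defines, so every distinct matrix compiles into its own specialized program with the weights folded into immediate operands. On the host side this amounts to composing a build-options string for clBuildProgram, sketched here (illustrative host code, not the library's actual plumbing):

    #include <stdio.h>
    #include <string.h>

    /* Compose OpenCL build options that bake a 3x3 matrix and its scale into
     * the program, in the style the static convolution kernels expect. */
    int main(void)
    {
        const short mat[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; /* 3x3 Gaussian */
        char opts[512] = "-DDATA_TYPE=short -DDATA_TYPE_OUT=uchar";
        char buf[64];
        int scale = 0;

        for (int i = 0; i < 9; ++i) {
            scale += mat[i];
            snprintf(buf, sizeof(buf), " -DMAT%d=%d", i, mat[i]);
            strncat(opts, buf, sizeof(opts) - strlen(opts) - 1);
        }
        snprintf(buf, sizeof(buf), " -DSCALE=%d", scale != 0 ? scale : 1);
        strncat(opts, buf, sizeof(opts) - strlen(opts) - 1);

        printf("%s\n", opts); /* would be passed to clBuildProgram() */
        return 0;
    }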
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
deleted file mode 100644
index 925a698628..0000000000
--- a/src/core/CL/cl_kernels/convolution_rectangle.cl
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "convolution3x3.cl"
-#include "convolution5x5.cl"
-#include "convolution7x7.cl"
-#include "convolution9x9.cl"
-#include "helpers.h"
-
-#define MAT_INDEX(i) MAT##i
-
-#ifndef DATA_TYPE
-#define DATA_TYPE short
-#endif /* DATA_TYPE */
-
-#ifndef COMPUTE_TYPE
-#define COMPUTE_TYPE int
-#endif /* COMPUTE_TYPE */
-
-#ifndef DATA_TYPE_OUT
-#define DATA_TYPE_OUT uchar
-#endif /* DATA_TYPE_OUT */
-
-#ifndef DYNAMIC_MATRIX_CONVOLUTION
-
-/** Apply a rectangular convolution matrix to a single channel U8 input image and output a single channel image, including borders.
- *
- * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n
- * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void convolution_rectangle(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- short matrix_coeff[81] =
- {
- MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
- MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
- MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
- MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
- MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
- MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
- MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
- MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
- MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
- };
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
-
- for(int i = 0; i < MATRIX_HEIGHT; i++)
- {
-#if MATRIX_WIDTH == 3
- pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
- matrix_coeff[2 + i * 3]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 5
- pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
- matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 7
- pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
- matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
- matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
-#endif /* MATRIX_WIDTH */
-
-#if MATRIX_WIDTH == 9
- pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
- matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
- matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
-#endif /* MATRIX_WIDTH */
- }
-
- pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
-
-    // Saturate and store the result in dst (already divided by SCALE above)
- vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
-}
-
-#endif /* not DYNAMIC_MATRIX_CONVOLUTION */
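
convolution_rectangle keeps up to 81 coefficients in one flat array and indexes it row-major with the active width (matrix_coeff[col + row * MATRIX_WIDTH]), while the #if chain on MATRIX_WIDTH picks the matching 1xN row helper at compile time. The same computation in scalar C (illustrative helper and test data):

    #include <stdio.h>

    /* Scalar model of convolution_rectangle: a k_w x k_h kernel applied around
     * (x, y), reading coefficients row-major from a flat array. */
    static int conv_rect(const unsigned char *img, int img_w,
                         const short *coeff, int k_w, int k_h, int x, int y)
    {
        int acc = 0;
        for (int i = 0; i < k_h; ++i)
            for (int j = 0; j < k_w; ++j)
                acc += (int)img[(y - k_h / 2 + i) * img_w + (x - k_w / 2 + j)]
                       * coeff[j + i * k_w];
        return acc;
    }

    int main(void)
    {
        unsigned char img[7 * 7];
        for (int i = 0; i < 49; ++i)
            img[i] = (unsigned char)i;
        static const short box[3 * 5] = { 1, 1, 1, 1, 1, 1, 1, 1,
                                          1, 1, 1, 1, 1, 1, 1 };
        printf("%d\n", conv_rect(img, 7, box, 3, 5, 3, 3)); /* 360 */
        return 0;
    }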
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
deleted file mode 100644
index 81fa01ae99..0000000000
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ /dev/null
@@ -1,1879 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#include "activation_float_helpers.h"
-
-/** Get a pointer at a certain offset in the x and y direction from a starting pointer.
- *
- * @param[in] ptr Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
- *
- * @return a __global uchar pointer to the element at the requested offset
- */
-inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
-{
- return ptr + x * stride_x + y * stride_y;
-}
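
Because ptr_offset walks raw bytes, a single helper serves every element type and any row padding: the strides already encode both. The equivalent C, where the packed-float strides in the demo are assumptions for illustration:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Byte-stride addressing: the strides carry the element size and any row
     * padding, so the same helper works for any element type. */
    static inline uint8_t *ptr_offset_c(uint8_t *ptr, int x, int y,
                                        int stride_x, int stride_y)
    {
        return ptr + (ptrdiff_t)x * stride_x + (ptrdiff_t)y * stride_y;
    }

    int main(void)
    {
        float img[4][4] = { { 0.0f } };
        img[2][3] = 42.0f;
        /* Packed float image: stride_x = 4 bytes, stride_y = 16 bytes. */
        float *p = (float *)ptr_offset_c((uint8_t *)img, 3, 2, 4, 16);
        printf("%.1f\n", *p); /* 42.0 */
        return 0;
    }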
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
- })
-
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
- })
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
- })
-
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
- })
-
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
- ({ \
- acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
- })
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
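
Each CONVOLUTION1x3_BIFROST{2X1,4X1}_STRIDE{1,2} macro above is a fully unrolled multiply-accumulate that produces 2 or 4 horizontal outputs of one kernel row as a chain of fma operations, which keeps Bifrost's FMA pipes busy; output n takes its taps starting at input lane n * stride. A scalar fmaf model of the 2x1, stride-1 variant:

    #include <math.h>
    #include <stdio.h>

    /* Model of CONVOLUTION1x3_BIFROST2X1_STRIDE1: two adjacent outputs of one
     * 1x3 kernel row, accumulated with fused multiply-add. Output n reads
     * inputs n, n+1, n+2. */
    static void conv1x3_2x1_stride1(float acc[2], const float src[4],
                                    const float w[3])
    {
        for (int n = 0; n < 2; ++n)
            for (int k = 0; k < 3; ++k)
                acc[n] = fmaf(src[n + k], w[k], acc[n]);
    }

    int main(void)
    {
        float acc[2] = { 0.0f, 0.0f };
        const float src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        const float w[3] = { 0.25f, 0.5f, 0.25f };
        conv1x3_2x1_stride1(acc, src, w);
        printf("%g %g\n", acc[0], acc[1]); /* 2 3 */
        return 0;
    }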
-
-#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
-#if defined(CONV_STRIDE_X)
-
-#if CONV_STRIDE_X == 1
-#define convolution1x3 convolution1x3_stride_1
-#elif CONV_STRIDE_X == 2
-#define convolution1x3 convolution1x3_stride_2
-#elif CONV_STRIDE_X == 3
-#define convolution1x3 convolution1x3_stride_3
-#else /* CONV_STRIDE_X */
-#error "Stride not supported"
-#endif /* CONV_STRIDE_X */
-
-/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
- const float left_coeff,
- const float middle_coeff,
- const float right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
- float4 temp = vload4(0, (__global float *)left_pixel);
-
- float2 left = CONVERT(temp.s01, float2);
- float2 middle = CONVERT(temp.s12, float2);
- float2 right = CONVERT(temp.s23, float2);
- return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
- + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
- + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
- const float left_coeff,
- const float middle_coeff,
- const float right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
- float4 temp0 = vload4(0, (__global float *)left_pixel);
- float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
-
- float2 left = CONVERT(temp0.s02, float2);
- float2 middle = CONVERT(temp0.s13, float2);
- float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
-
- return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- __global float *left_pixel_float = (__global float *)left_pixel;
-
- return vload4(0, left_pixel_float).s02 * (float2)left_coeff
- + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
- + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
- const float left_coeff,
- const float middle_coeff,
- const float right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
- float4 temp0 = vload4(0, (__global float *)left_pixel);
- float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
-
- float2 left = CONVERT(temp0.s03, float2);
- float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
- float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
-
- return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- __global float *left_pixel_float = (__global float *)left_pixel;
-
- return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
- + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
- + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
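
The three stride helpers above differ only in which lanes feed the two outputs: output n reads inputs n * stride + k * DILATION_X for taps k = 0..2, and each variant loads just enough floats to cover the largest index (4, 5 and 6 floats respectively in the dense case). The general form as a C sketch:

    #include <stdio.h>

    /* Generic model of convolution1x3_stride_{1,2,3}: two outputs, where
     * output n reads inputs n * stride + k * dilation_x for taps k = 0..2. */
    static void conv1x3_two_out(const float *in, int stride, int dilation_x,
                                const float w[3], float out[2])
    {
        for (int n = 0; n < 2; ++n) {
            out[n] = 0.0f;
            for (int k = 0; k < 3; ++k)
                out[n] += in[n * stride + k * dilation_x] * w[k];
        }
    }

    int main(void)
    {
        const float in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        const float w[3] = { 1, 1, 1 };
        float out[2];
        conv1x3_two_out(in, 3, 1, w, out); /* stride 3: taps {0,1,2}, {3,4,5} */
        printf("%g %g\n", out[0], out[1]); /* 6 15 */
        return 0;
    }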
-
-/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src          A pointer to the source image data
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] mat0         Coefficient from the convolution matrix
- * @param[in] mat1         Coefficient from the convolution matrix
- * @param[in] mat2         Coefficient from the convolution matrix
- * @param[in] mat3         Coefficient from the convolution matrix
- * @param[in] mat4         Coefficient from the convolution matrix
- * @param[in] mat5         Coefficient from the convolution matrix
- * @param[in] mat6         Coefficient from the convolution matrix
- * @param[in] mat7         Coefficient from the convolution matrix
- * @param[in] mat8         Coefficient from the convolution matrix
- *
- * @return a float2 containing 2 convolved values.
- */
-inline float2 convolution3x3(
- __global const uchar *src,
- unsigned int src_stride_y,
- const float mat0, const float mat1, const float mat2,
- const float mat3, const float mat4, const float mat5,
- const float mat6, const float mat7, const float mat8)
-{
- float2 pixels;
-
- pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
- pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
- pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
-
- return pixels;
-}
-
-/** This OpenCL kernel computes the depthwise convolution 3x3
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- float2 pixels = 0.0f;
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
-
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
- // Load the weights
- float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- pixels = convolution3x3(src_addr, src_stride_y,
- weights_values0.s0, weights_values0.s1, weights_values0.s2,
- weights_values1.s0, weights_values1.s1, weights_values1.s2,
- weights_values2.s0, weights_values2.s1, weights_values2.s2);
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = *((__global float *)(vector_offset(&biases, channel)));
-
- pixels += (float2)bias;
-#endif //defined(HAS_BIAS)
-
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
-}
-#endif //defined(CONV_STRIDE_X)
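
The src_addr expression in this kernel converts the global Z index, which enumerates batch * DST_CHANNELS output planes, back to the input plane it must read: with IFM = DST_CHANNELS / DEPTH_MULTIPLIER input channels, output plane batch * DST_CHANNELS + channel maps to input plane batch * IFM + channel / DEPTH_MULTIPLIER, and the two subtracted step_z terms perform exactly that correction. A C check of the identity over small sizes:

    #include <assert.h>
    #include <stdio.h>

    /* Verify the kernel's input-plane arithmetic: starting from the output
     * plane index batch * OFM + channel, subtracting
     *   batch * IFM * (DM - 1)   and   (channel - channel / DM)
     * lands on input plane batch * IFM + channel / DM. */
    int main(void)
    {
        for (int dm = 1; dm <= 4; ++dm)
            for (int ifm = 1; ifm <= 8; ++ifm) {
                const int ofm = ifm * dm; /* DST_CHANNELS = IFM * DEPTH_MULTIPLIER */
                for (int batch = 0; batch < 3; ++batch)
                    for (int channel = 0; channel < ofm; ++channel) {
                        const int out_z = batch * ofm + channel;
                        const int in_z = out_z - batch * ifm * (dm - 1)
                                         - (channel - channel / dm);
                        assert(in_z == batch * ifm + channel / dm);
                    }
            }
        printf("index math OK\n");
        return 0;
    }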
-
-#if(DILATION_X > 1 || DILATION_Y > 1)
-
-/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] stride_x_bytes   Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y_bytes   Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset         Offset into the source tensor from which to start the convolution
- * @param[in] weights_addr     Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension
- */
-inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- float2 pixels0 = 0.0f;
-
- float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
-
-/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] stride_x_bytes   Stride of the source tensor in X dimension (in bytes)
- * @param[in] stride_y_bytes   Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset         Offset into the source tensor from which to start the convolution
- * @param[in] weights_addr     Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension
- */
-inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- float2 pixels0 = 0.0f;
-
- float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
-
-#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
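
With dilation enabled, the 3x3 window's taps land at (x + j * DILATION_X, y + i * DILATION_Y) rather than at adjacent pixels, which is why the helpers above issue separate left/mid/right loads per row instead of one contiguous vector load. The access pattern in scalar C:

    #include <stdio.h>

    /* Dilated 3x3 convolution at output (x, y): tap (i, j) reads input
     * (x + j * dil_x, y + i * dil_y); dil_x = dil_y = 1 is the dense case. */
    static float conv3x3_dilated(const float *img, int img_w, int x, int y,
                                 const float w[9], int dil_x, int dil_y)
    {
        float acc = 0.0f;
        for (int i = 0; i < 3; ++i)
            for (int j = 0; j < 3; ++j)
                acc += img[(y + i * dil_y) * img_w + (x + j * dil_x)]
                       * w[j + i * 3];
        return acc;
    }

    int main(void)
    {
        float img[8 * 8];
        for (int i = 0; i < 64; ++i)
            img[i] = (float)i;
        const float box[9] = { 1, 1, 1, 1, 1, 1, 1, 1, 1 };
        /* Dilation 2: the nine taps span a 5x5 footprint. */
        printf("%g\n", conv3x3_dilated(img, 8, 0, 0, box, 2, 2)); /* 162 */
        return 0;
    }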
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
- * stride_x and stride_y are equal to 1
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- float2 pixels0 = 0.0f;
- float2 pixels1 = 0.0f;
- float2 pixels2 = 0.0f;
- float2 pixels3 = 0.0f;
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
- float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
- float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
- float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
- float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
- float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
-
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 1st row
- pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 3rd row
- pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = *((__global float *)(vector_offset(&biases, channel)));
-
- pixels0 += (float2)bias;
- pixels1 += (float2)bias;
- pixels2 += (float2)bias;
- pixels3 += (float2)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
-}
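
A work-item of this kernel writes a 4-row by 2-column tile, so it needs tile_h + (k_h - 1) = 4 + 2 = 6 input rows, and input rows 1..4 are each reused by up to three accumulators straight from registers; that reuse is the point of the 4x2 blocking. The stride-1 row-count relation, for reference:

    #include <stdio.h>

    /* Input rows consumed by a vertical output tile at stride 1:
     * tile_h + (k_h - 1) * dil_y. The 4x2 tile above: 4 + 2 * 1 = 6. */
    static int rows_needed(int tile_h, int k_h, int dil_y)
    {
        return tile_h + (k_h - 1) * dil_y;
    }

    int main(void)
    {
        printf("%d\n", rows_needed(4, 3, 1)); /* 6 */
        return 0;
    }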
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
- * stride_x and stride_y are equal to 2
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- float2 pixels0 = 0.0f;
- float2 pixels1 = 0.0f;
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
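-
-    // [Editor's note: worked example, not part of the original file] With
-    // DEPTH_MULTIPLIER = 2, DST_CHANNELS = 8 (so the input has 4 channels),
-    // batch = 1 and channel = 5, src.ptr starts on output plane 1 * 8 + 5 = 13;
-    // the two subtractions rewind it by 1 * 4 * 1 + (5 - 2) = 7 planes, landing
-    // on input plane 1 * 4 + 5 / 2 = 6, i.e. input channel 2 of batch 1.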
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-
-    // Note: Since each work-item computes 2x2 elements, we need to load 5 rows from the input tensor
- float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
- float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
- float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
- float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
- float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
- float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
- float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
- float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
- float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
-
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
- CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = *((__global float *)(vector_offset(&biases, channel)));
-
- pixels0 += (float2)bias;
- pixels1 += (float2)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
-}
-
-#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
-/** Reshape the weights for quantized depthwise convolution
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
- * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
- * @attention Input's height and width should be 3
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void depthwise_convolution_reshape_weights(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Vector src = CONVERT_TO_VECTOR_STRUCT(src);
- const int x = get_global_id(0);
-
- // Load 3x3xVEC_SIZE weights
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
-
-#if defined(TRANSPOSE)
-#if VEC_SIZE != 4
-#error "VEC_SIZE not supported"
-#else // VEC_SIZE != 4
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
-#endif // VEC_SIZE != 4
-#else // !defined(TRANSPOSE)
- VSTORE(VEC_SIZE)
- (w0, 0, dst_addr + 0);
- VSTORE(VEC_SIZE)
- (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
- VSTORE(VEC_SIZE)
- (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
-#endif // defined(TRANSPOSE)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
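-
-// [Editor's note: illustrative layout, not part of the original file] For
-// VEC_SIZE = 4 the kernel above emits 36 values per work-item (wK = tap K of
-// the 3x3 filter, sC = channel C of the 4-channel vector):
-//   without TRANSPOSE: w0.s0..s3, w1.s0..s3, ..., w8.s0..s3 (grouped by tap)
-//   with TRANSPOSE:    w0.s0, w1.s0, ..., w8.s0, w0.s1, ... (grouped by
-//                      channel: the nine taps of a channel stored contiguously)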
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
-#if defined(CONV_STRIDE_X)
-#if CONV_STRIDE_X == 1
-#define convolution1x3_f16 convolution1x3_stride_1_f16
-#elif CONV_STRIDE_X == 2
-#define convolution1x3_f16 convolution1x3_stride_2_f16
-#elif CONV_STRIDE_X == 3
-#define convolution1x3_f16 convolution1x3_stride_3_f16
-#else /* CONV_STRIDE_X */
-#error "Stride not supported"
-#endif /* CONV_STRIDE_X */
-
-#if(DILATION_X > 1 || DILATION_Y > 1)
-
-/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset         Offset in rows from the start of the source tensor at which to begin the convolution
- * @param[in] weights_addr     Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- */
-inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- half4 pixels0 = 0.0f;
-
- half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
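-
-// [Editor's note: assumption, not part of the original file] ptr_offset() is
-// taken to resolve to src_addr + x * stride_x_bytes + y * stride_y_bytes, so
-// in the function above the three column loads of each row sit DILATION_X
-// pixels apart and the three rows DILATION_Y pixels apart.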
-
-/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
- *
- * @param[in] src_addr Pointer to the starting position of where to perform the convolution
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] y_offset         Offset in rows from the start of the source tensor at which to begin the convolution
- * @param[in] weights_addr     Pointer from where to get weights
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- */
-inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
- const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
-{
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- half4 pixels0 = 0.0f;
-
- half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
- half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
- half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
-
- half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
- half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
- half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
-
- half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
- half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
- half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
-
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
-
- return pixels0;
-}
-
-#endif // (DILATION_X > 1 || DILATION_Y > 1)
-
-/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
- const half left_coeff,
- const half middle_coeff,
- const half right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- half8 temp = vload8(0, (__global half *)left_pixel);
-
- half4 left = CONVERT(temp.s0123, half4);
- half4 middle = CONVERT(temp.s1234, half4);
- half4 right = CONVERT(temp.s2345, half4);
-
- return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
- return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
- + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
- + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
- const half left_coeff,
- const half middle_coeff,
- const half right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- half8 temp0 = vload8(0, (__global half *)left_pixel);
- half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
-
- half4 left = CONVERT(temp0.s0246, half4);
- half4 middle = CONVERT(temp0.s1357, half4);
- half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
-
- return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- __global half *left_pixel_float = (__global half *)left_pixel;
-
- return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
- + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
- + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
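-
-// [Editor's note: worked example, not part of the original file] In the
-// non-dilated branch above, writing the nine loaded halves as x0..x8:
-//   left = (x0, x2, x4, x6), middle = (x1, x3, x5, x7), right = (x2, x4, x6, x8)
-// so output k (k = 0..3) evaluates x[2k] * left_coeff + x[2k+1] * middle_coeff
-// + x[2k+2] * right_coeff, i.e. four 1x3 windows spaced two pixels apart.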
-
-/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
- *
- * @param[in] left_pixel Pointer to the left pixel.
- * @param[in] left_coeff Weight of the left pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] right_coeff Weight of the right pixel
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
- const half left_coeff,
- const half middle_coeff,
- const half right_coeff)
-{
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- half16 temp0 = vload16(0, (__global half *)left_pixel);
-
- half4 left = CONVERT(temp0.s0369, half4);
- half4 middle = CONVERT(temp0.s147A, half4);
- half4 right = CONVERT(temp0.s258B, half4);
-
- return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- __global half *left_pixel_float = (__global half *)left_pixel;
-
- return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
- + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
- + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-}
-
-/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src A pointer to source Image structure
- * @param[in] mat0 Coefficient from the convolution matrix
- * @param[in] mat1 Coefficient from the convolution matrix
- * @param[in] mat2 Coefficient from the convolution matrix
- * @param[in] mat3 Coefficient from the convolution matrix
- * @param[in] mat4 Coefficient from the convolution matrix
- * @param[in] mat5 Coefficient from the convolution matrix
- * @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat7 Coefficient from the convolution matrix
- * @param[in] mat8 Coefficient from the convolution matrix
- *
- * @return a half4 containing 4 convolved values.
- */
-inline half4 convolution3x3_f16(
- Image *src,
- const half mat0, const half mat1, const half mat2,
- const half mat3, const half mat4, const half mat5,
- const half mat6, const half mat7, const half mat8)
-{
- half4 pixels;
-
- pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
- pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
- pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
-
- return pixels;
-}
-
-#if defined(DEPTH_MULTIPLIER)
-
-/** This OpenCL kernel computes the depthwise convolution 3x3
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_f16(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif //defined(HAS_BIAS)
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
- half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
- half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
- half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
-
- half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
- weights_values1.s0, weights_values1.s1, weights_values1.s2,
- weights_values2.s0, weights_values2.s1, weights_values2.s2);
-#if defined(HAS_BIAS)
- pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
-#endif //defined(HAS_BIAS)
-
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
-}
-#endif // defined(DEPTH_MULTIPLIER)
-#endif // defined(CONV_STRIDE_X)
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
- * when both stride_x and stride_y are equal to 1
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- half bias = *((__global half *)(vector_offset(&biases, channel)));
-#endif /* defined(HAS_BIAS) */
-
- half4 pixels0 = 0.0f;
- half4 pixels1 = 0.0f;
- half4 pixels2 = 0.0f;
- half4 pixels3 = 0.0f;
-
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
- half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
- half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
- half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
- half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
- half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
- half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
-
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
-
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 1st row
- pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 3rd row
- pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
-
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- pixels0 += (half4)bias;
- pixels1 += (half4)bias;
- pixels2 += (half4)bias;
- pixels3 += (half4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
-}
-
-/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
- * when both stride_x and stride_y are equal to 2
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- half bias = *((__global half *)(vector_offset(&biases, channel)));
-#endif /* defined(HAS_BIAS) */
-
- half4 pixels0 = 0.0f;
- half4 pixels1 = 0.0f;
-
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input; OFM = IFM * DEPTH_MULTIPLIER)
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
-
-#if(DILATION_X == 1 && DILATION_Y == 1)
-
- // Load the weights
- half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
- half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
- half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
-
- // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
- half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
- half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
- half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
- half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
- half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
- half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
- half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
- half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
- half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
- half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
-
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
- CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
-
-#else /* DILATION_X==1 && DILATION_Y==1 */
- //3x3 Convolution of elements starting in 0th row
- pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
- //3x3 Convolution of elements starting in 2nd row
- pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
-#endif /* DILATION_X==1 && DILATION_Y==1 */
-
-#ifdef HAS_BIAS
- pixels0 += (half4)bias;
- pixels1 += (half4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
-}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
-
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
-/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
- * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
- * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
- * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
- * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER (e.g. -DVEC_SIZE_LEFTOVER=3). It is defined as the remainder of the input's first dimension divided by N0
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void dwc_MxN_native_fp_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(HAS_BIAS)
-)
-{
- int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE);
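-
-    // [Editor's note: worked example, not part of the original file] With
-    // N0 = 4 and VEC_SIZE_LEFTOVER = 3, work-item 0 gets x_offs = 0 while
-    // work-item 1 gets (1 * 4 - 1) = 3 elements: every access after the first
-    // is shifted back by (N0 - VEC_SIZE_LEFTOVER) % N0 so that full N0-wide
-    // loads and stores never run past the end of the row.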
-
- int x = get_global_id(0); // channels
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;
-
- __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
-
- __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
-
-#if defined(HAS_BIAS)
- __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
-#endif // defined(HAS_BIAS)
-
-#if defined(DST_DEPTH)
- s_addr += b * src_stride_w;
- d_addr += b * dst_stride_w;
-#endif // defined(DST_DEPTH)
-
- for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
- {
- // Each work-item computes N0x1x1 elements
- VEC_DATA_TYPE(DATA_TYPE, N0)
- res0 = 0;
-
- int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
- int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
-
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- if(y_coord >= 0 && y_coord < SRC_DIM2)
- {
- int x_coord_tmp = x_coord;
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
- {
- int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
- int w_offset = xk * weights_stride_y + yk * weights_stride_z;
-
- // Load input and weights values
- VEC_DATA_TYPE(DATA_TYPE, N0)
- i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
- VEC_DATA_TYPE(DATA_TYPE, N0)
- w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
-
-#if GPU_ARCH == GPU_ARCH_MIDGARD
- res0 += i * w;
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
- res0 = fma(i, w, res0);
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
- }
- x_coord_tmp += DILATION_X;
- }
- }
- y_coord += DILATION_Y;
- }
-
-#if defined(HAS_BIAS)
- res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
-#endif // defined(HAS_BIAS)
-
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-
- w_addr += sizeof(DATA_TYPE);
- d_addr += sizeof(DATA_TYPE);
-#if defined(HAS_BIAS)
- b_addr += sizeof(DATA_TYPE);
-#endif // defined(HAS_BIAS)
- }
-}
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
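-
-/* [Editor's note: illustrative host-side sketch, not part of the original
- * file] The kernel above is configured entirely through the -D options listed
- * in its documentation; a plausible options string (the values are
- * placeholders for the example) could be passed to clBuildProgram() like so:
- *
- *   const char *opts =
- *       "-DDATA_TYPE=float -DN0=2 -DDEPTH_MULTIPLIER=1 "
- *       "-DSRC_DIM1=112 -DSRC_DIM2=112 -DKERNEL_WIDTH=3 -DKERNEL_HEIGHT=3 "
- *       "-DCONV_PAD_LEFT=1 -DCONV_PAD_TOP=1 -DCONV_STRIDE_X=1 -DCONV_STRIDE_Y=1 "
- *       "-DDILATION_X=1 -DDILATION_Y=1 -DVEC_SIZE_LEFTOVER=0 "
- *       "-DACTIVATION_TYPE=relu -DA_VAL=0.0f -DB_VAL=0.0f";
- *   clBuildProgram(program, 1, &device, opts, NULL, NULL);
- */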
-
-#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
-
-#if DATA_TYPE != float && DATA_TYPE != half
-#error "Unsupported data type"
-#endif // DATA_TYPE != float && DATA_TYPE != half
-
-#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
- ({ \
- basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \
- basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \
- basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \
- })
-
-#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
- ({ \
- FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
- basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \
- })
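-
-// [Editor's note: usage sketch, not part of the original file] Given row
-// vectors values0..values3 and a condition whose components are -1 (all bits
-// set) for out-of-bound rows and 0 otherwise, e.g.:
-//   int4 cond = (int4)(-1, 0, 0, 0);
-//   FILL_ZERO_OUT_OF_BOUND_4(float, 4, values, cond);
-// zeroes values0 and leaves values1..values3 untouched, since select() picks
-// the zero vector wherever the MSB of the condition component is set.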
-
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
-
-/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note In case of biases, -DHAS_BIAS must be passed at compile time
- * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] max_offset Max offset for the input tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif /* defined(HAS_BIAS) */
-)
-{
- int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
-#endif /* defined(DST_DEPTH) */
-
- int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);
- int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);
-
- int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));
- int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));
-
- // Use these vectors to check whether the unclamped load would have been out of bounds
- src_coord_y = (src_offset_y != src_coord_y);
- src_coord_z = (src_offset_z != src_coord_z);
-
- src_offset_y *= (int3)src_stride_y;
- src_offset_z *= (int3)src_stride_z;
-
- // We compute VEC_SIZEx1x1 [C,W,H] elements
- VEC_FLOAT acc0 = 0;
-
- // Load weights
- VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
-
- // Load input values
- // z == 0
- VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
- VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
- VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
-
- FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);
-
- acc0 = fma(values0, w0, acc0);
- acc0 = fma(values1, w1, acc0);
- acc0 = fma(values2, w2, acc0);
-
- // z == 1
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
-
- FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);
-
- acc0 = fma(values0, w3, acc0);
- acc0 = fma(values1, w4, acc0);
- acc0 = fma(values2, w5, acc0);
-
- // z == 2
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
-
- FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);
-
- acc0 = fma(values0, w6, acc0);
- acc0 = fma(values1, w7, acc0);
- acc0 = fma(values2, w8, acc0);
-
-#if defined(HAS_BIAS)
- __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
- VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
- acc0 += bias_values;
-#endif // defined(HAS_BIAS)
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;
-#endif /* defined(DST_DEPTH) */
-
- acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-}
-#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
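
A note on the border handling above: rather than branching on padding, the kernel clamps every source coordinate into the valid range, always loads from the clamped address, and then compares the clamped and unclamped coordinates; the mismatch mask feeds FILL_ZERO_OUT_OF_BOUND_3 so that lanes a padded load would have fetched from outside the tensor are zeroed. A minimal host-side sketch of the same idea, in plain C with hypothetical names:

/* Clamp-and-mask border handling: every load is in bounds, and the
 * clamped-vs-unclamped comparison recovers the zero padding. */
#include <stdio.h>

#define SRC_DIM 5

static int load_with_zero_padding(const int *src, int coord)
{
    int clamped      = coord < 0 ? 0 : (coord > SRC_DIM - 1 ? SRC_DIM - 1 : coord);
    int out_of_bound = (clamped != coord); /* same test as src_offset_y != src_coord_y */
    int value        = src[clamped];       /* always a valid load */
    return out_of_bound ? 0 : value;       /* FILL_ZERO_OUT_OF_BOUND analogue */
}

int main(void)
{
    const int src[SRC_DIM] = { 1, 2, 3, 4, 5 };
    for (int coord = -1; coord <= SRC_DIM; ++coord)
        printf("coord %2d -> %d\n", coord, load_with_zero_padding(src, coord));
    return 0;
}
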
-
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)
- * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. -DDST_DIM_1=64)
- * @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32)
- * @note In case of biases, -DHAS_BIAS must be passed at compile time
- * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] max_offset Max offset for the input tensor
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void depthwise_convolution_3x3_nhwc_stride1(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif /* defined(HAS_BIAS) */
-)
-{
- int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
-#endif /* defined(DST_DEPTH) */
-
- int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int);
- int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int);
-
- int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
- int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
-
- // Use these vectors to check whether the unclamped load would have been out of bounds
- src_coord_y = (src_offset_y != src_coord_y);
- src_coord_z = (src_offset_z != src_coord_z);
-
- src_offset_y *= (int4)src_stride_y;
- src_offset_z *= (int4)src_stride_z;
-
- // We compute VEC_SIZEx2x2 [C,W,H] elements
- VEC_FLOAT acc0 = 0;
- VEC_FLOAT acc1 = 0;
- VEC_FLOAT acc2 = 0;
- VEC_FLOAT acc3 = 0;
-
- // Load weights
- VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
-
- // Load input values
- // z == 0
- VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
- VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
- VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
- VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);
-
- acc0 = fma(values0, w0, acc0);
- acc0 = fma(values1, w1, acc0);
- acc0 = fma(values2, w2, acc0);
- acc1 = fma(values1, w0, acc1);
- acc1 = fma(values2, w1, acc1);
- acc1 = fma(values3, w2, acc1);
-
- // z == 1
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);
-
- acc0 = fma(values0, w3, acc0);
- acc0 = fma(values1, w4, acc0);
- acc0 = fma(values2, w5, acc0);
- acc1 = fma(values1, w3, acc1);
- acc1 = fma(values2, w4, acc1);
- acc1 = fma(values3, w5, acc1);
-
- acc2 = fma(values0, w0, acc2);
- acc2 = fma(values1, w1, acc2);
- acc2 = fma(values2, w2, acc2);
- acc3 = fma(values1, w0, acc3);
- acc3 = fma(values2, w1, acc3);
- acc3 = fma(values3, w2, acc3);
-
- // z == 2
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);
-
- acc0 = fma(values0, w6, acc0);
- acc0 = fma(values1, w7, acc0);
- acc0 = fma(values2, w8, acc0);
- acc1 = fma(values1, w6, acc1);
- acc1 = fma(values2, w7, acc1);
- acc1 = fma(values3, w8, acc1);
-
- acc2 = fma(values0, w3, acc2);
- acc2 = fma(values1, w4, acc2);
- acc2 = fma(values2, w5, acc2);
- acc3 = fma(values1, w3, acc3);
- acc3 = fma(values2, w4, acc3);
- acc3 = fma(values3, w5, acc3);
-
- // z == 3
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));
-
- FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);
-
- acc2 = fma(values0, w6, acc2);
- acc2 = fma(values1, w7, acc2);
- acc2 = fma(values2, w8, acc2);
- acc3 = fma(values1, w6, acc3);
- acc3 = fma(values2, w7, acc3);
- acc3 = fma(values3, w8, acc3);
-
-#if defined(HAS_BIAS)
- __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
-
- VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
-
- acc0 += bias_values;
- acc1 += bias_values;
- acc2 += bias_values;
- acc3 += bias_values;
-#endif // defined(HAS_BIAS)
-
- int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
- int dst_coord_z = z * NUM_PLANES_PROCESSED;
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;
-#else // defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;
-#endif // defined(DST_DEPTH)
-
- /* Store vectors in reverse order along the Y. The Y offsets are calculated so that they are forced to be in bound.
- * If only the first address is in bound, the Y offset of the second address will be brought back and there will be 2 writes in the same location for the same thread.
- * Since the last vector to be written is always the valid one for that location, it overwrites the wrong values.
- */
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-
-#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- if((dst_coord_z + 1) < DST_DIM_2)
-#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- {
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
-
- values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL);
- STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
- }
-}
-
-#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
\ No newline at end of file
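
The reverse-order store at the end of depthwise_convolution_3x3_nhwc_stride1 deserves a worked example: when the tile's second row falls outside the output, its Y offset is clamped back onto the last valid row, so two accumulators target the same address; writing the out-of-range row first and the valid row last leaves the correct contents without a branch. A plain C sketch of that behaviour, with hypothetical sizes:

/* Reverse-order store: the clamped, invalid row is written first and then
 * overwritten by the valid one at the same location. */
#include <stdio.h>

#define DST_DIM 3 /* odd height: the last 2-row tile has only one valid row */

int main(void)
{
    float dst[DST_DIM] = { 0 };
    int   y            = 2;     /* tile origin: rows y and y + 1 */
    float acc0         = 10.0f; /* result for row y (valid) */
    float acc1         = 20.0f; /* result for row y + 1 (out of range) */

    int off0 = y < DST_DIM ? y : DST_DIM - 1;         /* min(y,     DST_DIM - 1) */
    int off1 = y + 1 < DST_DIM ? y + 1 : DST_DIM - 1; /* clamped back onto off0  */

    dst[off1] = acc1; /* wrong value written first...         */
    dst[off0] = acc0; /* ...then overwritten by the valid row */

    printf("dst[%d] = %g\n", off0, dst[off0]); /* prints 10, not 20 */
    return 0;
}
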
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
deleted file mode 100644
index 95cd44eb78..0000000000
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ /dev/null
@@ -1,1795 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifndef VEC_SIZE
-#if defined(N0)
-#define VEC_SIZE N0
-#else /* defined(N0) */
-#define VEC_SIZE 8
-#endif /* defined(N0) */
-#endif /* VEC_SIZE */
-
-#if defined(ACTIVATION_TYPE) && defined(CONST_0)
-#include "activation_layer_quant.cl"
-#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x)
-#else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
-#define ACTIVATION_FUNC(x) (x)
-#endif /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
-
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE)
-
-#if defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
-
-#define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
-
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
-
-#if defined(WEIGHTS_PROMOTED_TYPE)
-#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
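
ARM_DOT above maps onto the cl_arm_integer_dot_product_int8 built-ins where the extension is available: a dot product of two 4-element 8-bit vectors accumulated into a 32-bit integer. A portable C stand-in for the assumed semantics (a reference model, not the built-in itself):

/* Reference model of arm_dot_acc(): widen to 32 bits, multiply-accumulate. */
#include <stdint.h>
#include <stdio.h>

static int32_t arm_dot_acc_ref(const int8_t x[4], const int8_t y[4], int32_t acc)
{
    for (int i = 0; i < 4; ++i)
        acc += (int32_t)x[i] * (int32_t)y[i]; /* widen before multiplying */
    return acc;
}

int main(void)
{
    const int8_t x[4] = { 1, -2, 3, -4 };
    const int8_t y[4] = { 5, 6, -7, 8 };
    printf("%d\n", arm_dot_acc_ref(x, y, 100)); /* 100 + (5 - 12 - 21 - 32) = 40 */
    return 0;
}
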
-
-#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
-
-#if CONV_STRIDE_X > 3
-#error "Stride X not supported"
-#endif /* CONV_STRIDE_X > 3 */
-
-#if !defined(IS_DOT8)
-
-#if DILATION_X == 1
-
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \
- int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \
- \
- left = CONVERT(temp0.s01234567, int8); \
- middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
- right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \
- \
- left = CONVERT(temp0.s02468ace, int8); \
- middle = CONVERT(temp0.s13579bdf, int8); \
- right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
- \
- left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
- right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
- })
-#endif /* CONV_STRIDE_X */
-
-#else /* DILATION_X == 1 */
-
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \
- middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \
- right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- left = CONVERT(temp0.s02468ace, int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \
- middle = CONVERT(temp0.s02468ace, int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
- right = CONVERT(temp0.s02468ace, int8); \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
- int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
- left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \
- temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \
- middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- \
- temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
- temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \
- right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
- })
-
-#endif /* CONV_STRIDE_X */
-#endif /* DILATION_X==1 */
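
For CONV_STRIDE_X == 2 with DILATION_X == 1, the GET_VALUES variants above build the three 3x3 column taps for eight adjacent outputs out of a single 16-element load: the left taps are the even lanes (temp0.s02468ace), the middle taps the odd lanes (temp0.s13579bdf), and the right taps the even lanes shifted by one, with a 17th scalar filling the last slot. A plain C sketch of that de-interleave:

/* Stride-2 tap extraction for 8 outputs from a 17-element input row. */
#include <stdio.h>

int main(void)
{
    int row[17];
    for (int i = 0; i < 17; ++i)
        row[i] = i; /* input row: 0, 1, 2, ... */

    int left[8], middle[8], right[8];
    for (int i = 0; i < 8; ++i)
    {
        left[i]   = row[2 * i];     /* temp0.s02468ace           */
        middle[i] = row[2 * i + 1]; /* temp0.s13579bdf           */
        right[i]  = row[2 * i + 2]; /* temp0.s2468, .sace, temp1 */
    }
    for (int i = 0; i < 8; ++i)
        printf("out %d taps: %d %d %d\n", i, left[i], middle[i], right[i]);
    return 0;
}
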
-
-/** This function computes the quantized depthwise convolution for the NCHW data layout.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                                                  src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                                                  dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                                              weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-
-__kernel void dwc_3x3_native_quantized8_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- int bias_value = *((__global int *)(vector_offset(&biases, channel)));
-#endif //defined(HAS_BIAS)
-
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
- w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
- w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
- w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
- const int output_shift = *((__global int *)vector_offset(&output_shifts, channel));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
- int8 values0 = 0;
- int8 sum0 = 0;
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- int8 values1 = 0;
- int8 sum1 = 0;
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
- // Row0
- int8 left, middle, right;
- GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
- values0 += left * (int8)(w0.s0);
- values0 += middle * (int8)(w0.s1);
- values0 += right * (int8)(w0.s2);
-
-#if WEIGHTS_OFFSET != 0
- sum0 += left + middle + right;
-#endif /* WEIGHTS_OFFSET != 0 */
-
- // Row1
- GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
- values0 += left * (int8)(w1.s0);
- values0 += middle * (int8)(w1.s1);
- values0 += right * (int8)(w1.s2);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += left * (int8)(w0.s0);
- values1 += middle * (int8)(w0.s1);
- values1 += right * (int8)(w0.s2);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
-#if WEIGHTS_OFFSET != 0
- int8 tmp = left + middle + right;
- sum0 += tmp;
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
- // Row2
- GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
- values0 += left * (int8)(w2.s0);
- values0 += middle * (int8)(w2.s1);
- values0 += right * (int8)(w2.s2);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += left * (int8)(w1.s0);
- values1 += middle * (int8)(w1.s1);
- values1 += right * (int8)(w1.s2);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
-#if WEIGHTS_OFFSET != 0
- tmp = left + middle + right;
- sum0 += tmp;
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- // Row3
- GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
- values1 += left * (int8)(w2.s0);
- values1 += middle * (int8)(w2.s1);
- values1 += right * (int8)(w2.s2);
-
-#if WEIGHTS_OFFSET != 0
- sum1 += left + middle + right;
-#endif /* WEIGHTS_OFFSET != 0 */
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-
-#if defined(HAS_BIAS)
- values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
-#endif //defined(HAS_BIAS)
-
-#if WEIGHTS_OFFSET != 0
- values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- VEC_WEIGHTS_PROMOTED_TYPE(3)
- tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
-
- WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
- values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* INPUT_OFFSET != 0 */
-
-#if K_OFFSET != 0
- values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-#endif /* K_OFFSET != 0 */
-
-#if defined(REAL_MULTIPLIER)
-
- values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
- int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
- values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values0 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res0 = CONVERT_SAT(values0, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
-#if defined(REAL_MULTIPLIER)
-
- values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
- int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
- values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values1 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res1 = CONVERT_SAT(values1, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-}
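
The WEIGHTS_OFFSET / INPUT_OFFSET / K_OFFSET corrections in the kernel above come from expanding the true quantized product (x + INPUT_OFFSET) * (w + WEIGHTS_OFFSET): the inner loops accumulate only the raw x * w terms, and the three offset terms are added afterwards, with K_OFFSET being the constant 9 * INPUT_OFFSET * WEIGHTS_OFFSET of a 3x3 filter. A small C check of the identity, with hypothetical offsets and data:

/* Verifies: sum((x+a)*(w+b)) == sum(x*w) + b*sum(x) + a*sum(w) + N*a*b */
#include <stdio.h>

#define IN_OFF 3   /* hypothetical INPUT_OFFSET   */
#define W_OFF (-2) /* hypothetical WEIGHTS_OFFSET */
#define TAPS   9   /* 3x3 filter                  */

int main(void)
{
    const int x[TAPS] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    const int w[TAPS] = { 1, 0, -1, 2, 0, -2, 1, 0, -1 };

    int direct = 0, values = 0, sum_x = 0, sum_w = 0;
    for (int i = 0; i < TAPS; ++i)
    {
        direct += (x[i] + IN_OFF) * (w[i] + W_OFF); /* reference result */
        values += x[i] * w[i];                      /* raw accumulator  */
        sum_x += x[i];
        sum_w += w[i];
    }
    /* sum0 * WEIGHTS_OFFSET + sum_weights * INPUT_OFFSET + K_OFFSET */
    values += sum_x * W_OFF + sum_w * IN_OFF + TAPS * IN_OFF * W_OFF;

    printf("direct=%d factored=%d\n", direct, values); /* identical */
    return 0;
}
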
-
-#else // !defined(IS_DOT8)
-
-#if DILATION_X == 1
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(8) \
- temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \
- VEC_TYPE(2) \
- temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \
- \
- left = temp0.s01234567; \
- middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \
- right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
- \
- left = temp0.s02468ace; \
- middle = temp0.s13579bdf; \
- right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- VEC_TYPE(8) \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
- \
- left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \
- right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \
- })
-#endif /* CONV_STRIDE_X */
-#else /*DILATION_X==1*/
-
-#if CONV_STRIDE_X == 1
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- left = vload8(0, (__global DATA_TYPE *)(first_value)); \
- middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
- right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
- })
-#elif CONV_STRIDE_X == 2
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- left = temp0.s02468ace; \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
- middle = temp0.s02468ace; \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
- right = temp0.s02468ace; \
- })
-#else /* CONV_STRIDE_X */
-#define GET_VALUES(first_value, left, middle, right) \
- ({ \
- VEC_TYPE(16) \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
- VEC_TYPE(8) \
-        temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE)));                 \
- left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))); \
- middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- \
- temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
- temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \
- right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
- })
-
-#endif /* CONV_STRIDE_X */
-#endif /*DILATION_X==1*/
-/** This function computes the quantized depthwise convolution using the dot product instruction when the data layout is NCHW.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-
-__kernel void dwc_3x3_native_quantized8_dot8_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif //defined(HAS_BIAS)
-)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
-
- // Extract channel and linearized batch indices
- const int channel = get_global_id(2) % DST_CHANNELS;
- const int batch = get_global_id(2) / DST_CHANNELS;
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
-#endif //defined(HAS_BIAS)
-
-    // Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
- __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
-
- VEC_TYPE(3)
- w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
- VEC_TYPE(3)
- w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
- VEC_TYPE(3)
- w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
-
- const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
- const int output_shift = *((__global int *)vector_offset(&output_shifts, 0));
-
- VEC_TYPE(8)
- left0, middle0, right0;
- VEC_TYPE(8)
- left1, middle1, right1;
- VEC_TYPE(8)
- left2, middle2, right2;
-
- int8 values0 = 0;
- int8 sum0 = 0;
-
- GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
- GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
- GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
-
-#if WEIGHTS_OFFSET != 0
- sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
- sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
- sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
-    // If conv_stride_y equals 1, we compute two output rows
-
- VEC_TYPE(8)
- left3, middle3, right3;
- int8 values1 = 0;
- int8 sum1 = 0;
-
- GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
-
-#if WEIGHTS_OFFSET != 0
- sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
- sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
- sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
-#endif /* WEIGHTS_OFFSET != 0 */
-#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
-
- ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
- ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
- values0.s0 += right2.s0 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
- ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
- values0.s1 += right2.s1 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
- ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
- values0.s2 += right2.s2 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
- ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
- values0.s3 += right2.s3 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
- ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
- values0.s4 += right2.s4 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
- ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
- values0.s5 += right2.s5 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
- ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
- values0.s6 += right2.s6 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
- ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
- values0.s7 += right2.s7 * w2.s2;
-
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
- ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
- values1.s0 += right3.s0 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
- ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
- values1.s1 += right3.s1 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
- ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
- values1.s2 += right3.s2 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
- ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
- values1.s3 += right3.s3 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
- ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
- values1.s4 += right3.s4 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
- ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
- values1.s5 += right3.s5 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
- ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
- values1.s6 += right3.s6 * w2.s2;
-
- ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
- ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
- values1.s7 += right3.s7 * w2.s2;
-#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
-
-#if defined(HAS_BIAS)
- values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif //defined(HAS_BIAS)
-
-#if WEIGHTS_OFFSET != 0
- values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- WEIGHTS_PROMOTED_TYPE sum_weights = 0;
- VEC_WEIGHTS_PROMOTED_TYPE(3)
- tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
- sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
- values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-#endif /* INPUT_OFFSET != 0 */
-
-#if K_OFFSET != 0
- values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
- values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-#endif /* K_OFFSET != 0 */
-
-#if defined(REAL_MULTIPLIER)
-
- values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
- int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
- values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values0 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res0 = CONVERT_SAT(values0, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
-
-#if defined(REAL_MULTIPLIER)
-
- values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
- int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
- values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- values1 += (int8)OUTPUT_OFFSET;
- VEC_TYPE(8)
- res1 = CONVERT_SAT(values1, VEC_TYPE(8));
-
- vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
-}
-
-#endif // !defined(IS_DOT8)
-
-#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
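Both requantization branches above implement the same mapping from the int32 accumulator back to the output's quantized domain:

$$\text{values} \leftarrow \operatorname{round}(\text{values} \cdot M) + \text{OUTPUT\_OFFSET}, \qquad M = \frac{s_{\text{src}}\, s_{\text{weights}}}{s_{\text{dst}}}$$

With REAL_MULTIPLIER the floating-point M is applied directly; otherwise M is encoded (following what appears to be the usual gemmlowp-style convention) as a Q0.31 fixed-point OUTPUT_MULTIPLIER together with a power-of-two OUTPUT_SHIFT, with the GREATER_THAN_ONE and LESS_THAN_ONE helpers covering the negative and non-negative shift cases respectively.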
-
-#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
-
-#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
-
-#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT)
-
-#if WEIGHTS_OFFSET != 0
-#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
- ({ \
- sum += CONVERT(x, VEC_INT); \
- MULTIPLY_ADD(x, y, acc); \
- })
-#else /* WEIGHTS_OFFSET != 0 */
-#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
-#endif /* WEIGHTS_OFFSET != 0 */
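For context on why MULTIPLY_ADD_ACCUMULATE also tracks a running input sum: the offset corrections applied by these kernels follow the standard zero-point expansion of a quantized convolution. For K filter taps (K = 9 for the 3x3 kernels below),

$$\sum_{k=1}^{K}(x_k + O_{in})(w_k + O_w) \;=\; \underbrace{\sum_k x_k w_k}_{\text{acc}} \;+\; O_w \underbrace{\sum_k x_k}_{\text{sum}} \;+\; O_{in} \sum_k w_k \;+\; K\, O_{in} O_w$$

which is why the kernels add WEIGHTS_OFFSET * sum, INPUT_OFFSET * sum_we and the constant K_OFFSET (presumably precomputed on the host as 9 * INPUT_OFFSET * WEIGHTS_OFFSET for a 3x3 filter) after the main accumulation.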
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
- ({ \
- ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \
- ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \
- acc += val8 * w1; \
- })
-
-#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
- ({ \
- sum = val0; \
- ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \
- ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \
- })
-
-#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
- ({ \
- sum = w1; \
- ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \
- ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \
- })
-
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
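As a rough standalone illustration of the pattern DOT_PRODUCT expands to, two hardware dot products plus a scalar tail cover the nine taps of a 3x3 filter. The sketch below is not part of this file; it assumes unsigned 8-bit data and the arm_dot_acc builtin from the accumulate variant of the extension (the guard above only checks cl_arm_integer_dot_product_int8; ARM_DOT abstracts over both forms):

#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable

__kernel void dot9_sketch(__global const uchar *x, __global const uchar *w, __global uint *out)
{
    uchar8 x0 = vload8(0, x); // taps 0..7 of the input window
    uchar8 w0 = vload8(0, w); // taps 0..7 of the filter

    uint acc = 0;
    acc = arm_dot_acc(x0.s0123, w0.s0123, acc); // taps 0..3 in one dot product
    acc = arm_dot_acc(x0.s4567, w0.s4567, acc); // taps 4..7 in a second one
    acc += (uint)x[8] * (uint)w[8];             // the ninth tap as a scalar multiply-add

    out[0] = acc;
}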
-
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
- *
- * @note This kernel assumes VEC_SIZE is 4.
- * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the reshaped weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- * @param[in] max_offset Max offset for the input tensor
- */
-__kernel void dwc_3x3_reshaped_quantized8_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- IMAGE_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- int max_offset)
-{
- const int x = get_global_id(0); // channels
- const int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
-#endif /* defined(DST_DEPTH) */
-
- int z_coord = 0;
- int4 offset = 0;
- int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;
-
- // A negative coordinate is possible only for y = 0. If so, the unsigned min() below clamps it to SRC_DIM_1
- y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
- y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
- y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
- y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
-
- int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
-
- // We compute VEC_SIZEx1x1 [C,W,H] elements
- VEC_INT acc = 0, sum = 0;
-
- // Load weights
- VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
- w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
- w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));
-
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w0 = w0_tmp.s0123;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w1 = w0_tmp.s4567;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w2 = w0_tmp.s89AB;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w3 = w0_tmp.sCDEF;
-
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w4 = w1_tmp.s0123;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w5 = w1_tmp.s4567;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w6 = w1_tmp.s89AB;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w7 = w1_tmp.sCDEF;
-
-#if INPUT_OFFSET != 0
- VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
- + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
- + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
-#endif /* INPUT_OFFSET != 0 */
-
- // Load input values
- // z == 0
- // Clamp z_coord, which can be negative for z = 0
- // z_coord is cast to unsigned int so that a single min() operation suffices
- // A 32-bit signed value of -1 converted to unsigned gives 4294967295
- z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
- z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
- offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, (int4)max_offset);
-
- VEC_TYPE(VEC_SIZE)
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
-
- // z == 1
- // z_coord can be negative only for z = 0, so we do not need to clamp it
- // Moreover, z_coord cannot be out of bounds for z = 1, so we do not need to clamp the offset
- z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
- offset = y_offset + (int4)(z_coord * src_stride_z);
- VEC_TYPE(VEC_SIZE)
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
-
- // z == 2
- // The offset can be out of bounds, so we need to check it against max_offset
- z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
- offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, (int4)max_offset);
- VEC_TYPE(VEC_SIZE)
- values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
-
- MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
- MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
- MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);
-
- MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
- MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
- MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);
-
- MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
- MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
- MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
- VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
- acc += bias_values;
-#endif // defined(HAS_BIAS)
-
-#if WEIGHTS_OFFSET != 0
- acc += WEIGHTS_OFFSET * sum;
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- acc += INPUT_OFFSET * sum_we;
-#endif /* INPUT_OFFSET != 0 */
-
-#if K_OFFSET != 0
- acc += (VEC_INT)K_OFFSET;
-#endif /* K_OFFSET != 0 */
-
-#if defined(REAL_MULTIPLIER)
-
- acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);
- VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
- VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-
- VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE);
- VEC_INT res_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
- acc = select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
-#else // OUTPUT_SHIFT < 0
- acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- acc += (VEC_INT)OUTPUT_OFFSET;
-
- VEC_TYPE(VEC_SIZE)
- res = CONVERT_SAT(acc, VEC_TYPE(VEC_SIZE));
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
-#endif /* defined(DST_DEPTH) */
-
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res), 0, (__global DATA_TYPE *)(dst_addr));
-}
-#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
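The z_coord handling in the kernel above leans on an unsigned-arithmetic trick that is easy to miss; a minimal sketch with illustrative values (112 standing in for SRC_DIM_2):

__kernel void clamp_sketch(__global uint *out)
{
    int z_coord = -1;                  // e.g. z = 0 with CONV_PAD_TOP = 1

    // Cast to uint: -1 wraps to 4294967295, so a single min() catches both the
    // negative (top padding) case and values past the end of the tensor,
    // clamping everything to the sentinel plane SRC_DIM_2.
    out[0] = min((uint)z_coord, (uint)112);
}

The resulting byte offset is then capped once more with min(offset, max_offset) so the redirected load still lands inside the buffer.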
-
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
- *
- * @note This kernel assumes VEC_SIZE is 4.
- * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)
- * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- * @param[in] max_offset Max offset for the input tensor
- */
-
-__kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- IMAGE_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- int max_offset)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
-#endif /* defined(DST_DEPTH) */
-
- int z_coord = 0;
- int4 offset = 0;
- int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
-
- // A negative coordinate is possible only for y = 0. If so, the unsigned min() below clamps it to SRC_DIM_1
- y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
- y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
- y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
- y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
-
- int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
-
- // We compute 4x2x2 [C,W,H] elements
- VEC_INT acc0 = 0, sum0 = 0;
- VEC_INT acc1 = 0, sum1 = 0;
- VEC_INT acc2 = 0, sum2 = 0;
- VEC_INT acc3 = 0, sum3 = 0;
-
- // Load weights
- VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
- w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 16)
- w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));
-
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w0 = w0_tmp.s0123;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w1 = w0_tmp.s4567;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w2 = w0_tmp.s89AB;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w3 = w0_tmp.sCDEF;
-
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w4 = w1_tmp.s0123;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w5 = w1_tmp.s4567;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w6 = w1_tmp.s89AB;
- VEC_DATA_TYPE(WEIGHTS_TYPE, 4)
- w7 = w1_tmp.sCDEF;
-
-#if INPUT_OFFSET != 0
- VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
- + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
- + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
-#endif /* INPUT_OFFSET != 0 */
-
- // Load input values
- // z == 0
- // Clamp z_coord, which can be negative for z = 0
- // z_coord is cast to unsigned int so that a single min() operation suffices
- // A 32-bit signed value of -1 converted to unsigned gives 4294967295
- z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
- z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
- offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, (int4)max_offset);
-
- VEC_TYPE(VEC_SIZE)
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- // z == 1
- // z_coord can be negative only for z = 0, so we do not need to clamp it
- // Moreover, z_coord cannot be out of bounds for z = 1, so we do not need to clamp the offset
- z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
- offset = y_offset + (int4)(z_coord * src_stride_z);
- VEC_TYPE(VEC_SIZE)
- values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- // z == 2
- // After z = 1 we can simply add src_stride_z to the offset without updating z_coord
- // However, the offset can be out of bounds, so we need to check it against max_offset
- offset += (int4)src_stride_z;
- offset = min(offset, (int4)max_offset);
- VEC_TYPE(VEC_SIZE)
- values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- // z == 3
- // After z = 1 we can simply add src_stride_z to the offset without updating z_coord
- // However, the offset can be out of bounds, so we need to check it against max_offset
- offset += (int4)(src_stride_z);
- offset = min(offset, (int4)max_offset);
- VEC_TYPE(VEC_SIZE)
- values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
- MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
- MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
-
- MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
- MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
- MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
-
- MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
- MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
- MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
- MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
-
- MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
- MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
- MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
-
- MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
- MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
- MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
-
- MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
- MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
- MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
- MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
- VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
-
- acc0 += bias_values;
- acc1 += bias_values;
- acc2 += bias_values;
- acc3 += bias_values;
-#endif /* defined(HAS_BIAS) */
-
-#if WEIGHTS_OFFSET != 0
- acc0 += WEIGHTS_OFFSET * sum0;
- acc1 += WEIGHTS_OFFSET * sum1;
- acc2 += WEIGHTS_OFFSET * sum2;
- acc3 += WEIGHTS_OFFSET * sum3;
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- VEC_INT offs = INPUT_OFFSET * sum_we;
-
- acc0 += offs;
- acc1 += offs;
- acc2 += offs;
- acc3 += offs;
-#endif /* INPUT_OFFSET != 0 */
-
-#if K_OFFSET != 0
- acc0 += (VEC_INT)K_OFFSET;
- acc1 += (VEC_INT)K_OFFSET;
- acc2 += (VEC_INT)K_OFFSET;
- acc3 += (VEC_INT)K_OFFSET;
-#endif /* K_OFFSET != 0 */
-
-#if defined(REAL_MULTIPLIER)
-
- acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
- acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
- acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
- acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);
- VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
- VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-
- VEC_INT res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
- VEC_INT res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
- VEC_INT res2_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
- VEC_INT res3_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
- VEC_INT res0_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
- VEC_INT res1_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
- VEC_INT res2_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
- VEC_INT res3_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
- acc0 = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);
- acc1 = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);
- acc2 = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);
- acc3 = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
- acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
- acc2 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
- acc3 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
-#else // OUTPUT_SHIFT < 0
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#endif // defined(REAL_MULTIPLIER)
-
- acc0 += (VEC_INT)OUTPUT_OFFSET;
- acc1 += (VEC_INT)OUTPUT_OFFSET;
- acc2 += (VEC_INT)OUTPUT_OFFSET;
- acc3 += (VEC_INT)OUTPUT_OFFSET;
-
- VEC_TYPE(VEC_SIZE)
- res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));
- VEC_TYPE(VEC_SIZE)
- res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));
- VEC_TYPE(VEC_SIZE)
- res2 = CONVERT_SAT(acc2, VEC_TYPE(VEC_SIZE));
- VEC_TYPE(VEC_SIZE)
- res3 = CONVERT_SAT(acc3, VEC_TYPE(VEC_SIZE));
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
-#endif /* defined(DST_DEPTH) */
-
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-
-#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
-#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- {
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
- }
-}
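A condensed sketch of the branchless per-channel requantization used twice above: both fixed-point paths are evaluated and select() then picks per lane on the sign of each channel's shift, avoiding divergent control flow inside the work-item (assuming helpers_asymm.h is included, as it is for this file, and a vector width of 4):

int4 requantize_per_channel(int4 acc, int4 multiplier, int4 shift)
{
    // Path taken by channels whose shift is negative (multiplier > 1)...
    int4 res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, multiplier, shift, 4);
    // ...and by channels whose shift is non-negative (multiplier < 1).
    int4 res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc, multiplier, shift, 4);

    return select(res_shift_lt0, res_shift_gt0, shift >= 0);
}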
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.
- *
- * @note Per-channel quantization is not supported by this kernel.
- * @note This kernel assumes VEC_SIZE is 4.
- * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
- * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)
- * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
- * @note If REAL_MULTIPLIER is passed at compile time (e.g. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating-point multiplication.
- * If not, the quantization will be performed using a fixed-point multiplication.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- * @param[in] max_offset The maximum allowed offset for the input tensor
- */
-__kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- IMAGE_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(biases),
-#endif // defined(HAS_BIAS)
- int max_offset)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
-
-#if defined(DST_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
-#endif /* defined(DST_DEPTH) */
-
- int z_coord = 0;
- int4 offset = 0;
- int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
-
- // A negative coordinate is possible only for y = 0. If so, the unsigned min() below clamps it to SRC_DIM_1
- y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
- y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
- y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
- y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
-
- int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
-
- // We compute 4x2x1 [C,W,H] elements
- VEC_INT acc0 = 0;
- VEC_INT acc1 = 0;
- VEC_INT sum0 = 0;
- VEC_INT sum1 = 0;
-
- // Load weights
- VEC_TYPE(16)
- w0 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
- VEC_TYPE(16)
- w1 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
- VEC_TYPE(4)
- w2 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32));
-
-#if INPUT_OFFSET != 0
- // Initialize the final result with the weights reduction multiplied by INPUT_OFFSET
- DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
- DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
- DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
- DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
-
- // Multiply the weights reduction by INPUT_OFFSET
- acc0 = INPUT_OFFSET * acc0;
-
- acc1 = acc0;
-#endif // INPUT_OFFSET != 0
-
- // Load input values
- // z == 0
- // Clamp z_coord, which can be negative for z = 0
- // z_coord is cast to unsigned int so that a single min() operation suffices
- // A 32-bit signed value of -1 converted to unsigned gives 4294967295
- z_coord = z - (int)CONV_PAD_TOP;
- z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
- offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, (int4)max_offset);
-
- VEC_TYPE(VEC_SIZE)
- values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- // z == 1
- // z_coord can be negative only for z = 0, so we do not need to clamp it
- // Moreover, z_coord cannot be out of bounds for z = 1, so we do not need to clamp the offset
- z_coord = z - (int)CONV_PAD_TOP + 1;
- offset = y_offset + (int4)(z_coord * src_stride_z);
- VEC_TYPE(VEC_SIZE)
- values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- // z == 2
- // After z = 1 we can simply add src_stride_z to the offset without updating z_coord
- // However, the offset can be out of bounds, so we need to check it against max_offset
- offset += (int4)src_stride_z;
- offset = min(offset, (int4)max_offset);
- VEC_TYPE(VEC_SIZE)
- values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
- VEC_TYPE(VEC_SIZE)
- values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
- VEC_TYPE(VEC_SIZE)
- values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
- VEC_TYPE(VEC_SIZE)
- values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
-
- DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
- DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
- DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
- DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
-
- DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
- DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
- DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
- DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
-
- DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
- DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
- DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
- DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
-
- DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
- DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
- DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
- DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
- VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
-
- acc0 += bias_values;
- acc1 += bias_values;
-
-#endif // defined(HAS_BIAS)
-
-#if WEIGHTS_OFFSET != 0
- acc0 += WEIGHTS_OFFSET * sum0;
- acc1 += WEIGHTS_OFFSET * sum1;
-#endif // WEIGHTS_OFFSET != 0
-
-#if K_OFFSET != 0
- acc0 += (VEC_INT)K_OFFSET;
- acc1 += (VEC_INT)K_OFFSET;
-
-#endif // K_OFFSET != 0
-
-#if defined(REAL_MULTIPLIER)
-
- acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
- acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
-
-#else // defined(REAL_MULTIPLIER)
-
-#if OUTPUT_SHIFT < 0
- acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
- acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
-#else // OUTPUT_SHIFT < 0
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-#endif // OUTPUT_SHIFT < 0
-
-#endif // defined(REAL_MULTIPLIER)
- acc0 += (VEC_INT)OUTPUT_OFFSET;
- acc1 += (VEC_INT)OUTPUT_OFFSET;
-
- VEC_TYPE(VEC_SIZE)
- res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));
- VEC_TYPE(VEC_SIZE)
- res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));
-
-#if defined(DST_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
-#else /* defined(DST_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
-#endif /* defined(DST_DEPTH) */
-
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
- VSTORE(VEC_SIZE)
- (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-}
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4
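DOT_PRODUCT_REDUCTION above reuses the dot-product unit to build the nine-element input sums (sum0, sum1) needed for the WEIGHTS_OFFSET correction: dotting four values against a vector of ones adds them in a single instruction. A standalone sketch of the same idea, again assuming uchar data and the accumulate variant of the extension:

uint nine_tap_sum(uchar8 v, uchar v8)
{
    uint sum = v8;                              // seed with the ninth value
    sum = arm_dot_acc(v.s0123, (uchar4)1, sum); // add values 0..3 via a dot with ones
    sum = arm_dot_acc(v.s4567, (uchar4)1, sum); // add values 4..7
    return sum;
}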
-
-#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
-
-#endif // defined(WEIGHTS_PROMOTED_TYPE)
-
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
-
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
-/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
- *
- * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
- * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
- * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
- * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
- * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
- * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
- * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
- * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
- * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
- * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER (e.g. -DVEC_SIZE_LEFTOVER=3). It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
- * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
- * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
- * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
- * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
- * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
- * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
- */
-__kernel void dwc_MxN_native_quantized8_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
- VECTOR_DECLARATION(output_multipliers),
- VECTOR_DECLARATION(output_shifts)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(biases)
-#endif // defined(HAS_BIAS)
-)
-{
- int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0);
- int y = get_global_id(1); // spatial coordinate x
-#if defined(DST_DEPTH)
- int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
- int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
-
- __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
-
- __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
-
- __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER;
-
-#if defined(HAS_BIAS)
- __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
-#endif // defined(HAS_BIAS)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
- __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
-#if defined(DST_DEPTH)
- s_addr += b * src_stride_w;
- d_addr += b * dst_stride_w;
-#endif // defined(DST_DEPTH)
-
-#if DEPTH_MULTIPLIER > 1
- for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
- {
-#endif // DEPTH_MULTIPLIER > 1
- // Each work-item computes N0x1x1 elements
- VEC_INT res = 0;
-
- int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
- int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
-
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- if(y_coord >= 0 && y_coord < SRC_DIM2)
- {
- int x_coord_tmp = x_coord;
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
- {
- int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
- int w_offset = xk * weights_stride_y + yk * weights_stride_z;
-
- // Load input and weights values
- VEC_INT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_INT);
- VEC_INT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_INT);
-
- res += (i + (VEC_INT)INPUT_OFFSET) * (w + (VEC_INT)WEIGHTS_OFFSET);
- }
- x_coord_tmp += DILATION_X;
- }
- }
- y_coord += DILATION_Y;
- }
-
-#if defined(HAS_BIAS)
- VEC_INT bias = VLOAD(N0)(0, (__global int *)(b_addr));
- res += bias;
-#endif // defined(HAS_BIAS)
-
-#if defined(PER_CHANNEL_QUANTIZATION)
- VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
- VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));
-
- VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
- VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
- res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
-#else // defined(PER_CHANNEL_QUANTIZATION)
-#if OUTPUT_SHIFT < 0
- res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
-#else // OUTPUT_SHIFT < 0
- res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
-#endif // OUTPUT_SHIFT < 0
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-
- res += (VEC_INT)OUTPUT_OFFSET;
-
- VEC_TYPE(VEC_SIZE)
- res0 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));
- res0 = ACTIVATION_FUNC(res0);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-
-#if DEPTH_MULTIPLIER > 1
- w_addr += sizeof(WEIGHTS_TYPE);
- d_addr += sizeof(DATA_TYPE);
-#if defined(PER_CHANNEL_QUANTIZATION)
- out_mul_addr += sizeof(int);
- out_shift_addr += sizeof(int);
-#endif // defined(PER_CHANNEL_QUANTIZATION)
-#if defined(HAS_BIAS)
- b_addr += sizeof(int);
-#endif // defined(HAS_BIAS)
- }
-#endif // DEPTH_MULTIPLIER > 1
-}
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
-#endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
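
For reference, the fixed-point output stage the deleted kernel applies (the ASYMM_MULT_BY_QUANT_MULTIPLIER_* select, the offset add, and the saturating convert) can be sketched in scalar host-side C as below. This is illustrative only, not part of the patch; the helper names are invented here and are not ACL APIs.

    #include <stdint.h>
    #include <limits.h>

    /* gemmlowp-style saturating rounding doubling high multiply */
    static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX; /* the only case that overflows */
        int64_t ab    = (int64_t)a * (int64_t)b;
        int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
        return (int32_t)((ab + nudge) >> 31);
    }

    /* rounding divide by 2^exponent, rounding away from zero on ties */
    static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
    {
        const int32_t mask      = ((int32_t)1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    /* shift < 0: multiplier > 1 (pre-shift left, assumed not to overflow here);
     * shift >= 0: multiplier < 1 (high mul then rounding right shift) */
    static uint8_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t out_offset)
    {
        int32_t v = shift < 0
                    ? sat_rounding_doubling_high_mul(acc * (1 << -shift), multiplier)
                    : rounding_divide_by_pow2(sat_rounding_doubling_high_mul(acc, multiplier), shift);
        v += out_offset;                                   /* OUTPUT_OFFSET */
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* CONVERT_SAT  */
    }
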
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
deleted file mode 100644
index 127f67d940..0000000000
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-/** This performs the dequantization of 8-bit quantized integers to floating point.
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
- * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void dequantization_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale and offset vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = SCALE;
-
- const VEC_DATA_TYPE(int, VEC_SIZE)
- voffset = OFFSET;
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- __global float *scale)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = scale[get_global_id(2)];
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nhwc(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- __global float *scale)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
- scale -= max(xi - (int)LAST_ACCESSED_X, 0);
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
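
For reference, the two dequantization schemes deleted above reduce to the scalar formulas below: per-tensor uses one (scale, offset) pair for the whole tensor, per-channel symmetric uses one scale per channel with no offset. An illustrative host-side C sketch, not part of the patch:

    #include <stdio.h>

    /* per-tensor: res = scale * (q - offset) */
    static float dequantize_per_tensor(unsigned char q, float scale, int offset)
    {
        return scale * (float)((int)q - offset);
    }

    /* per-channel symmetric: res = scale[channel] * q */
    static float dequantize_per_channel(signed char q, const float *scales, int channel)
    {
        return scales[channel] * (float)q;
    }

    int main(void)
    {
        const float scales[2] = { 0.5f, 0.25f };
        printf("%f\n", dequantize_per_tensor(130, 0.1f, 128)); /* 0.200000  */
        printf("%f\n", dequantize_per_channel(-4, scales, 1)); /* -1.000000 */
        return 0;
    }
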
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
deleted file mode 100644
index dddbb4d615..0000000000
--- a/src/core/CL/cl_kernels/derivative.cl
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This OpenCL kernel computes the first-order derivative.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void derivative(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
- short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
- short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
- vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
- short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
- vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
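
For reference, the kernel above amounts to the central differences below. A scalar C sketch (it assumes the pixel is at least one sample inside the border; the kernel relies on border handling for that):

    #include <stdint.h>

    /* gx = right - left, gy = bottom - top; no normalisation is applied */
    static void derivative_xy(const uint8_t *src, int stride, int x, int y,
                              int16_t *gx, int16_t *gy)
    {
        const uint8_t *p = src + y * stride + x;
        *gx = (int16_t)p[1] - (int16_t)p[-1];
        *gy = (int16_t)p[stride] - (int16_t)p[-stride];
    }
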
diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
deleted file mode 100644
index 14362c1f31..0000000000
--- a/src/core/CL/cl_kernels/dilate.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function dilates an input image.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void dilate(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 top = vload16(0, offset(&src, -1, -1));
- uchar16 middle = vload16(0, offset(&src, -1, 0));
- uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
- uchar16 tmp = max(top, max(middle, bottom));
- uchar8 out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789));
-
- vstore8(out, 0, dst.ptr);
-}
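
The vectorised kernel above computes a 3x3 maximum filter: the per-lane max of the three rows is taken first, then a 3-wide sliding max across lanes. A scalar C sketch of the same result, ignoring border handling (illustrative only):

    #include <stdint.h>

    static uint8_t dilate3x3(const uint8_t *src, int stride, int x, int y)
    {
        uint8_t out = 0;
        for(int dy = -1; dy <= 1; ++dy)
        {
            const uint8_t *row = src + (y + dy) * stride + x;
            for(int dx = -1; dx <= 1; ++dx)
            {
                out = row[dx] > out ? row[dx] : out;
            }
        }
        return out;
    }
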
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
deleted file mode 100644
index d0eea5bfb4..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#define ADD_OP(a, b) ((a) + (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define CONVERT_SAT(a, b) ((a))
-
-#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(DATA_LAYOUT_NHWC)
-
-#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
-
-/** This kernel performs a direct convolution to convolve the low three dimensions of a tensor with data layout NHWC
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif /* defined(HAS_BIAS) */
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values = 0;
- const int id0 = get_global_id(0);
- const int id1 = get_global_id(1);
- const int id2 = get_global_id(2);
- weights.ptr += id0 * weights_stride_w;
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + id2 * STRIDE_Y * (int)src_stride_z;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
-#if STRIDE_X == 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
- PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 1 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 3 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 5 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 7 * src_stride_y, DATA_TYPE));
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
- VEC_DATA_TYPE(DATA_TYPE, 8)
- col0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(
- PTR_TO_VALUE(src_addr + 0 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 2 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 4 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 6 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 8 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 10 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 12 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(src_addr + 14 * src_stride_y, DATA_TYPE));
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
- values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, col0));
-
- src_addr += src_stride_x;
- weights.ptr += weights_stride_x;
- }
-
-#ifdef HAS_BIAS
- values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0))));
-#endif /* defined(HAS_BIAS) */
-
- *((__global DATA_TYPE *)dst.ptr) = values.s0;
- *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values.s1;
- *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values.s2;
- *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values.s3;
- *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values.s4;
- *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values.s5;
- *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values.s6;
- *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;
-}
-#endif // defined(DATA_LAYOUT_NHWC)
-
-#if STRIDE_X == 3
-#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
-#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
-#elif STRIDE_X == 2
-#define INPUT_PIXEL(data_size) extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_PIXEL(data_size) extract_input_stride1
-#else /* STRIDE_X not equals 1, 2 or 3 */
-#error "Only support strides 1, 2 and 3"
-#endif /* STRIDE_X == 3 */
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 1.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
-{
- return vload8(0, input_pixel);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 2.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp = vload16(0, input_pixel);
- return temp.s02468ace;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp1 = vload4(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp2 = vload4(0, input_pixel + 6);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp3 = vload4(0, input_pixel + 12);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- temp4 = vload4(0, input_pixel + 18);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp1 = vload8(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp2 = vload8(0, input_pixel + 8);
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp3 = vload8(0, input_pixel + 16);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
- *
- * @param[in] input_pixel Pointer to the first pixel.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp1 = vload16(0, input_pixel);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp2 = vload16(0, input_pixel + 12);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
-}
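
All four extraction helpers above implement the same gather, specialised per stride and data size so wide vloads plus swizzles can be used instead of a loop. A scalar C sketch of the shared behaviour (illustrative only):

    /* gather the 8 input values needed for 8 adjacent outputs, spaced
     * stride_x elements apart starting at input_pixel */
    static void extract_input(const float *input_pixel, int stride_x, float out[8])
    {
        for(int i = 0; i < 8; ++i)
        {
            out[i] = input_pixel[i * stride_x];
        }
    }
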
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif /* defined(HAS_BIAS) */
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values = 0;
-
- const uint z_index = get_global_id(2);
-
- weights.ptr += z_index * weights_stride_w;
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
- values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
- src.ptr += src_stride_z;
- weights.ptr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
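
For reference, each output element produced by the 1x1 kernels above is a dot product across the input channels, plus an optional bias. A scalar C sketch (strides given in elements rather than bytes, unlike the kernels):

    static float conv1x1_at(const float *src, int channel_stride, int weights_depth,
                            const float *weights, float bias)
    {
        float acc = bias; /* 0.0f when HAS_BIAS is not defined */
        for(int d = 0; d < weights_depth; ++d)
        {
            acc += src[d * channel_stride] * weights[d];
        }
        return acc;
    }
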
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
- ({ \
- acc.s0 = mad(src.s0, weight_value, acc.s0); \
- acc.s1 = mad(src.s1, weight_value, acc.s1); \
- acc.s2 = mad(src.s2, weight_value, acc.s2); \
- acc.s3 = mad(src.s3, weight_value, acc.s3); \
- })
-
-/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note In case biases are used, -DHAS_BIAS must be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution1x1_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 acc0 = 0.0f;
- float4 acc1 = 0.0f;
- float4 acc2 = 0.0f;
- float4 acc3 = 0.0f;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights
- float weight = *((__global float *)weights_addr);
-
- // Load values from rows 0..3 of the input tensor
- float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
-
- CONVOLUTION1x1_BIFROST(acc0, src0, weight);
- CONVOLUTION1x1_BIFROST(acc1, src1, weight);
- CONVOLUTION1x1_BIFROST(acc2, src2, weight);
- CONVOLUTION1x1_BIFROST(acc3, src3, weight);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- acc0.s0 += bias;
- acc0.s1 += bias;
- acc0.s2 += bias;
- acc0.s3 += bias;
- acc1.s0 += bias;
- acc1.s1 += bias;
- acc1.s2 += bias;
- acc1.s3 += bias;
- acc2.s0 += bias;
- acc2.s1 += bias;
- acc2.s2 += bias;
- acc2.s3 += bias;
- acc3.s0 += bias;
- acc3.s1 += bias;
- acc3.s2 += bias;
- acc3.s3 += bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
- vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
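
The Bifrost variant above relies on mad to fuse each multiply-accumulate; the CONVOLUTION1x1_BIFROST macro is equivalent to the scalar C sketch below, applied to each of the four accumulators per iteration.

    #include <math.h>

    /* mirrors acc.sN = mad(src.sN, weight, acc.sN) for a 4-wide accumulator */
    static void conv1x1_bifrost_step(float acc[4], const float src[4], float weight)
    {
        for(int i = 0; i < 4; ++i)
        {
            acc[i] = fmaf(src[i], weight, acc[i]);
        }
    }
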
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
deleted file mode 100644
index da7a1e7410..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#define ADD_OP(a, b) ((a) + (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define CONVERT_SAT(a, b) ((a))
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights_values0 = vload3(0, weights_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = vload8(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- src1 = vload2(0, src_row_ptr + 8); \
- \
- acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights_values0 = vload3(0, weights_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = vload16(0, src_row_ptr); \
- DATA_TYPE src1 = *(src_row_ptr + 16); \
- \
- acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
- })
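
Both CONVOLUTION1x3 variants above accumulate a 3-tap horizontal filter into 8 adjacent outputs, differing only in how far apart consecutive outputs sample the input row. A scalar C sketch of the shared logic (illustrative only):

    /* acc[i] += sum over k of src_row[i * stride_x + k] * w[k], for 8 outputs */
    static void convolution1x3_row(float acc[8], const float *src_row,
                                   const float w[3], int stride_x)
    {
        for(int i = 0; i < 8; ++i)
        {
            for(int k = 0; k < 3; ++k)
            {
                acc[i] += src_row[i * stride_x + k] * w[k];
            }
        }
    }
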
-
-#if defined(DATA_LAYOUT_NHWC)
-
-#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x3_STRIDE_NHWC_STRIDE1(acc, row_ptr, weights_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x3_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x3_STRIDE_NHWC_STRIDE2(acc, row_ptr, weights_ptr)
-#else /* STRIDE_X not equals 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x3_STRIDE_NHWC_STRIDE1(acc, row_ptr, weights_ptr) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- src1 = (VEC_DATA_TYPE(DATA_TYPE, 2))( \
- PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights = (VEC_DATA_TYPE(DATA_TYPE, 3))( \
- PTR_TO_VALUE((weights_ptr) + 0 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE((weights_ptr) + 1 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE((weights_ptr) + 2 * weights_stride_y, DATA_TYPE)); \
- acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s2)); \
- }
-
-#define CONVOLUTION1x3_STRIDE_NHWC_STRIDE2(acc, row_ptr, weights_ptr) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))( \
- PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE)); \
- DATA_TYPE src1 = PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE); \
- VEC_DATA_TYPE(DATA_TYPE, 3) \
- weights = (VEC_DATA_TYPE(DATA_TYPE, 3))( \
- PTR_TO_VALUE((weights_ptr) + 0 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE((weights_ptr) + 1 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE((weights_ptr) + 2 * weights_stride_y, DATA_TYPE)); \
- \
- acc = ADD_OP(acc, MUL_OP(src0.s02468ACE, (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s0)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s1)); \
- acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights.s2)); \
- }
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note This OpenCL kernel works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values0 = 0;
- const int id0 = get_global_id(0);
- const int id1 = get_global_id(1);
- const int id2 = get_global_id(2);
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
-
- weights_addr += id0 * weights_stride_w;
-
- const int coordy = ((id2 * STRIDE_Y) - PAD_TOP);
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
-#if PAD_TOP > 0
- if(coordy < 0) // special case: the row at Z = -1 doesn't exist
- {
- // skip the first row and load the next two
- CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x3_NHWC(values0, src_addr + 2 * (int)src_stride_z, (weights_addr + 2 * (int)weights_stride_z));
- }
- else if(coordy == (SRC_HEIGHT - PAD_TOP - 1))
- {
- // special case: when computing the last row of the output we must read the last three rows from the input buffer (including padding), but the
- // Z axis has no padding at all.
- CONVOLUTION1x3_NHWC(values0, src_addr, (weights_addr + 0 * (int)weights_stride_z));
- CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
- }
- else
- {
- CONVOLUTION1x3_NHWC(values0, src_addr, (weights_addr + 0 * (int)weights_stride_z));
- CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x3_NHWC(values0, src_addr + 2 * (int)src_stride_z, (weights_addr + 2 * (int)weights_stride_z));
- }
-#else // PAD_TOP > 0
- CONVOLUTION1x3_NHWC(values0, src_addr, (weights_addr + 0 * (int)weights_stride_z));
- CONVOLUTION1x3_NHWC(values0, src_addr + 1 * (int)src_stride_z, (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x3_NHWC(values0, src_addr + 2 * (int)src_stride_z, (weights_addr + 2 * (int)weights_stride_z));
-#endif // PAD_TOP > 0
- src_addr += src_stride_x;
- weights_addr += weights_stride_x;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0))));
-#endif /* defined(HAS_BIAS) */
-
- *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values0.s0;
- *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values0.s1;
- *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values0.s2;
- *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values0.s3;
- *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values0.s4;
- *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values0.s5;
- *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values0.s6;
- *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values0.s7;
-}
-#endif // defined(DATA_LAYOUT_NHWC)
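
The PAD_TOP handling above selects which of the three filter rows to apply so that rows falling outside the input are never read. A scalar C sketch of that border logic (left/right handling omitted; in the kernel it comes from implicit padding of the buffer):

    /* one output sample of a 3x3 convolution with top padding */
    static float conv3x3_at(const float *src, int width, int height,
                            const float w[3][3], int x, int out_y,
                            int stride_y, int pad_top)
    {
        float acc = 0.0f;
        for(int ky = 0; ky < 3; ++ky)
        {
            const int yy = out_y * stride_y - pad_top + ky;
            if(yy < 0 || yy >= height)
            {
                continue; /* padded row: skipped, contributes nothing */
            }
            for(int kx = 0; kx < 3; ++kx)
            {
                acc += src[yy * width + (x + kx)] * w[ky][kx];
            }
        }
        return acc;
    }
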
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note This OpenCL kernel works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
- values0 = 0;
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
-}
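-
-/* Host-side sketch (illustrative; the option values below are placeholders):
- * the @note requirements above translate into program build options, with
- * DATA_TYPE_PROMOTED naming the accumulator type used by this kernel.
- *
- *   const char *build_opts = "-DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float "
- *                            "-DSTRIDE_X=1 -DWEIGHTS_DEPTH=64 -DHAS_BIAS";
- *   cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
- */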
-#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
- acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
- acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
- acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
- acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
- acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
- acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
- acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
- acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
- acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
- acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
- })
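-
-/* A minimal scalar sketch (illustrative only) of what CONVOLUTION1x3_BIFROST
- * computes: four output columns, one 3-tap filter row, one mad() per product.
- *
- *   float in[6] = { src0.s0, src0.s1, src0.s2, src0.s3, src1.s0, src1.s1 };
- *   for(int i = 0; i < 4; ++i)
- *   {
- *       for(int k = 0; k < 3; ++k)
- *       {
- *           acc[i] = mad(in[i + k], w[k], acc[i]); // w[k] == weights_row0.s<k>
- *       }
- *   }
- */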
-
-/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution3x3_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 values0 = 0;
- float4 values1 = 0;
- float4 values2 = 0;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
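- // The loop below slides the three filter rows over those five input rows:
- // input row r (0..4) contributes to output rows max(0, r - 2)..min(2, r), i.e.
- // row0 -> values0; row1 -> values0,1; row2 -> values0,1,2; row3 -> values1,2; row4 -> values2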
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights
- float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- float4 src0;
- float2 src1;
-
- // Load values from row0 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
-
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);
-
- // Load values from row1 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);
-
- // Load values from row2 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);
-
- // Load values from row3 of input tensor
- src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);
-
- // Row4
- src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
- src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- values0 += (float4)bias;
- values1 += (float4)bias;
- values2 += (float4)bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
deleted file mode 100644
index e5c7a5107d..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ /dev/null
@@ -1,549 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = vload4(0, weights_row_ptr); \
- DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = vload8(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = vload4(0, src_row_ptr + 8); \
- \
- acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = vload4(0, weights_row_ptr); \
- DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = vload16(0, src_row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = vload4(0, src_row_ptr + 16); \
- acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
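-
-/* Illustrative reading of the two variants above: accumulator lane i holds
- * output column i, so filter tap k reads input column STRIDE_X * i + k. The
- * vector swizzles (e.g. src0.even, src0.s1357) are exactly these gathers
- * written as compile-time shuffles. Scalar form, for 8 output columns and a
- * 5-tap row:
- *
- *   for(int i = 0; i < 8; ++i)
- *       for(int k = 0; k < 5; ++k)
- *           acc[i] += in[STRIDE_X * i + k] * w[k];
- */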
-
-#if defined(DATA_LAYOUT_NHWC)
-
-#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define CONVOLUTION1x5_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X == 2 */
-
-#define CONVOLUTION1x5_STRIDE1_NHWC(acc, row_ptr, weights_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
- PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
- PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \
- DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE); \
- acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-#define CONVOLUTION1x5_STRIDE2_NHWC(acc, row_ptr, weights_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))( \
- PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- src1 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
- PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 17 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 18 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 19 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 4))( \
- PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE)); \
- DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE); \
- acc += src0.s02468ACE * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
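-
-/* In the NHWC variants above, neighbouring pixels along the spatial X axis sit
- * src_stride_y bytes apart (the innermost dimension holds the channels), so
- * each element is gathered individually instead of with a single vload. A
- * hypothetical helper equivalent to the PTR_TO_VALUE gathers:
- *
- *   DATA_TYPE load_x(__global const uchar *row_ptr, int x, uint stride_y)
- *   {
- *       return *((__global const DATA_TYPE *)(row_ptr + x * stride_y));
- *   }
- */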
-
-/** This kernel performs a direct convolution to convolve the low three dimensions in a tensor with the NHWC data layout
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- values0 = 0;
-
- const int id0 = get_global_id(0);
- const int id1 = get_global_id(1);
- const int id2 = get_global_id(2);
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
-
- weights_addr += id0 * weights_stride_w;
-
-#if(PAD_TOP == 1)
- const int coordy = id2 - PAD_TOP;
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- if(coordy < 0) // special case: the row at Z = -1 does not exist
- {
- // Skip the first row and load the next four
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- }
- else if(coordy == (DST_HEIGHT - PAD_TOP - 1))
- {
- // Special case: when computing the last output row, the bottom input row falls into the (non-existent) bottom padding,
- // so only the first four rows are accumulated; the Z axis carries the rows and has no padding at all.
- CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- }
- else
- {
- CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- }
- src_addr += src_stride_x;
- weights_addr += weights_stride_x;
- }
-#elif(PAD_TOP == 2)
- const int coordy = id2 * STRIDE_Y;
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- if(coordy == 0) // special case: the rows at Z = -2 and Z = -1 do not exist
- {
- // Skip the first two rows and load the next three
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- }
- else if(coordy == 1) // special case: the row at Z = -1 does not exist
- {
- // Skip the first row and load the next four
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- }
- else if(coordy == (SRC_HEIGHT - 1))
- {
- // Special case: when computing the last output row, the two bottom input rows fall into the (non-existent) bottom padding,
- // so only the first three rows are accumulated; the Z axis carries the rows and has no padding at all.
- CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- }
- else if(coordy == (SRC_HEIGHT - 2))
- {
- // Special case: when computing the second-to-last output row, the bottom input row falls into the (non-existent) bottom padding,
- // so only the first four rows are accumulated; the Z axis carries the rows and has no padding at all.
- CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- }
- else
- {
- CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- }
- src_addr += src_stride_x;
- weights_addr += weights_stride_x;
- }
-
-#else /* PAD_TOP == 2 */
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
- CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- src_addr += src_stride_x;
- weights_addr += weights_stride_x;
- }
-#endif /* PAD_TOP == 1 */
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, id0)));
-#endif /* defined(HAS_BIAS) */
-
- *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values0.s0;
- *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values0.s1;
- *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values0.s2;
- *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values0.s3;
- *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values0.s4;
- *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values0.s5;
- *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values0.s6;
- *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values0.s7;
-}
-
-#endif // defined(DATA_LAYOUT_NHWC)
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- values0 = 0;
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
-#endif /* defined(HAS_BIAS) */
-
- vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
-
-#if defined(WEIGHTS_DEPTH)
-
-#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
- ({ \
- acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
- acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
- acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
- acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
- acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
- acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
- acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
- acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
- acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
- acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
- acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
- acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
- acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
- acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
- acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
- acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
- acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
- acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
- acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
- acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
- })
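-
-/* Collapsed illustrative form of the twenty mad() calls above, with the 5-tap
- * row split into the vector weights_row00 (taps 0..3) and the scalar
- * weights_row01 (tap 4):
- *
- *   // w[0..3] = weights_row00.s0..s3, w[4] = weights_row01
- *   for(int i = 0; i < 4; ++i)
- *       for(int k = 0; k < 5; ++k)
- *           acc[i] = mad(src0[i + k], w[k], acc[i]);
- */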
-
-/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
- *
- * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution5x5_f32_bifrost(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- // Get the kernel index
- const int kernel_index = get_global_id(2);
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- float4 values0 = 0.0f;
- float4 values1 = 0.0f;
-
- __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
-
- // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
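- // Row reuse in the loop below: input row r (0..5) contributes to values0 with
- // filter row r (for r <= 4) and to values1 with filter row r - 1 (for r >= 1)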
-
- for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
- {
- // Load the weights from row0 and row1
- float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
- float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
- float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
- float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
- float8 src0;
-
- // Load values from row0 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
-
- // Load values from row1 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- // Load values from row2 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
-
- // Load weights from row2
- weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
- weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
-
- // Load values from row3 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
-
- // Load weights from row3
- weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
- weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- // Load values from row4 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
-
- // Load weights from row4
- weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
- weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
-
- CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01);
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11);
-
- // Load values from row5 of input tensor
- src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
-
- // Accumulate
- CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01);
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-
- float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
-
- values0 += bias;
- values1 += bias;
-#endif /* defined(HAS_BIAS) */
-
- vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
-}
-#endif // defined(WEIGHTS_DEPTH)
diff --git a/src/core/CL/cl_kernels/direct_convolution9x9.cl b/src/core/CL/cl_kernels/direct_convolution9x9.cl
deleted file mode 100644
index 64da38d64d..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution9x9.cl
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) && defined(PAD_TOP)
-
-#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
-
-#define CONVOLUTION1x9_STRIDE1_NHWC(acc, row_ptr, weights_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src1 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 5 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 6 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 7 * weights_stride_y, DATA_TYPE)); \
- DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 8 * weights_stride_y, DATA_TYPE); \
- acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s4567, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s4; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s567, src1.s0123, src1.s4) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s5; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s67, src1.s012, src1.s345) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s6; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s7, src1.s0123, src1.s456) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s7; \
- acc += src1 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-#define CONVOLUTION1x9_STRIDE2_NHWC(acc, row_ptr, weights_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 16) \
- src0 = (VEC_DATA_TYPE(DATA_TYPE, 16))( \
- PTR_TO_VALUE(row_ptr + 0 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 1 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 2 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 3 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 4 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 5 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 6 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 7 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 8 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 9 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 10 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 11 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 12 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 13 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 14 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 15 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- src1 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(row_ptr + 16 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 17 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 18 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 19 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 20 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 21 * src_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(row_ptr + 22 * src_stride_y, DATA_TYPE), PTR_TO_VALUE(row_ptr + 23 * src_stride_y, DATA_TYPE)); \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- weights_values0 = (VEC_DATA_TYPE(DATA_TYPE, 8))( \
- PTR_TO_VALUE(weights_ptr + 0 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 1 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 2 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 3 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 4 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 5 * weights_stride_y, DATA_TYPE), \
- PTR_TO_VALUE(weights_ptr + 6 * weights_stride_y, DATA_TYPE), PTR_TO_VALUE(weights_ptr + 7 * weights_stride_y, DATA_TYPE)); \
- DATA_TYPE weights_value1 = PTR_TO_VALUE(weights_ptr + 8 * weights_stride_y, DATA_TYPE); \
- acc += src0.s02468ACE * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468A, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s4; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s579, src0.sBDF, src1.s13) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s5; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s68A, src0.sCE, src1.s024) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s6; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s79B, src0.sDF, src1.s135) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s7; \
- acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s8AC, src0.sE, src1.s0246) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
- })
-
-#if defined(VEC_SIZE)
-#define VFMA(acc, w, src0, src1, src2, src3, src4, src5, src6, src7) \
- ({ \
- acc##0 = fma(src0, w, acc##0); \
- acc##1 = fma(src1, w, acc##1); \
- acc##2 = fma(src2, w, acc##2); \
- acc##3 = fma(src3, w, acc##3); \
- acc##4 = fma(src4, w, acc##4); \
- acc##5 = fma(src5, w, acc##5); \
- acc##6 = fma(src6, w, acc##6); \
- acc##7 = fma(src7, w, acc##7); \
- })
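-
-/* Note: the acc##N token pasting fans a single weight vector out across the
- * eight per-column accumulators, so VFMA(values, w0, src0, ..., src7)
- * expands to
- *
- *   values0 = fma(src0, w0, values0);
- *   ...
- *   values7 = fma(src7, w0, values7);
- */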
-
-#define CONVOLUTION1x9_STRIDE1_NHWC_BIFROST(acc, row_ptr, weights_ptr) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)row_ptr); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 2 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 3 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 4 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 5 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 6 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 7 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 8 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 9 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 10 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 11 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 12 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 13 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 14 * src_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- src15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(row_ptr + 15 * src_stride_y)); \
- \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 0 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 1 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 2 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 3 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 4 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 5 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 6 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 7 * weights_stride_y)); \
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) \
- w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_ptr + 8 * weights_stride_y)); \
- \
- VFMA(acc, w0, src0, src1, src2, src3, src4, src5, src6, src7); \
- VFMA(acc, w1, src1, src2, src3, src4, src5, src6, src7, src8); \
- VFMA(acc, w2, src2, src3, src4, src5, src6, src7, src8, src9); \
- VFMA(acc, w3, src3, src4, src5, src6, src7, src8, src9, src10); \
- VFMA(acc, w4, src4, src5, src6, src7, src8, src9, src10, src11); \
- VFMA(acc, w5, src5, src6, src7, src8, src9, src10, src11, src12); \
- VFMA(acc, w6, src6, src7, src8, src9, src10, src11, src12, src13); \
- VFMA(acc, w7, src7, src8, src9, src10, src11, src12, src13, src14); \
- VFMA(acc, w8, src8, src9, src10, src11, src12, src13, src14, src15); \
- })
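-
-/* In this VEC_SIZE variant the vector lanes run along the input channels (the
- * NHWC innermost dimension) rather than along output columns: values0..values7
- * hold per-output-column partial sums over VEC_SIZE channels, collapsed later
- * with REDUCE. Scalar shape of the nine VFMA calls above:
- *
- *   for(int i = 0; i < 8; ++i)        // output columns
- *       for(int k = 0; k < 9; ++k)    // filter taps
- *           acc[i] += src[i + k] * w[k]; // element-wise over VEC_SIZE lanes
- */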
-
-#if VEC_SIZE == 4
-#define REDUCE(out, vec) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- tmp1 = vec.s01 + vec.s23; \
- out = tmp1.s0 + tmp1.s1; \
- })
-#else // VEC_SIZE == 4
-#error("Not supported")
-#endif // VEC_SIZE == 4
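-
-/* REDUCE is a pairwise horizontal sum; for VEC_SIZE == 4 it evaluates
- *
- *   out = (vec.s0 + vec.s2) + (vec.s1 + vec.s3);
- *
- * which equals the plain sum of the four lanes up to floating-point
- * reassociation.
- */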
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x9_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x9_STRIDE1_NHWC_BIFROST(acc, row_ptr, weights_ptr)
-#else // STRIDE_X == 1
-#error "Not supported"
-#endif // STRIDE_X == 1
-
-#else // defined(VEC_SIZE)
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x9_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x9_STRIDE1_NHWC(acc, row_ptr, weights_ptr)
-#elif STRIDE_X == 2 // STRIDE_X == 1
-#define CONVOLUTION1x9_NHWC(acc, row_ptr, weights_ptr) CONVOLUTION1x9_STRIDE2_NHWC(acc, row_ptr, weights_ptr)
-#else // STRIDE_X == 1
-#error "STRIDE_X larger than 2 is not supported"
-#endif // STRIDE_X == 1
-
-#endif // defined(VEC_SIZE)
-
-/** This kernel performs a direct convolution to convolve the low three dimensions in a tensor with the NHWC data layout
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            (Optional) Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
- * @param[in] weights_stride_w (Optional) Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution9x9_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- values = 0;
-
-#if defined(VEC_SIZE)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values0 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values1 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values2 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values3 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values4 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values5 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values6 = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values7 = 0;
-#define STEP_X (VEC_SIZE)
-#else // defined(VEC_SIZE)
-#define STEP_X (1)
-#endif // defined(VEC_SIZE)
-
- const int id0 = get_global_id(0);
- const int id1 = get_global_id(1);
- const int id2 = get_global_id(2);
-
- __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
-
- weights_addr += id0 * weights_stride_w;
-
- const int coordy = (id2 * STRIDE_Y) - PAD_TOP;
- if(coordy < 0)
- {
- // Skip first rows containing padding
- for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
- {
- const int start_z = -coordy;
- for(int i = start_z; i < 9; ++i)
- {
- CONVOLUTION1x9_NHWC(values, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
- }
- src_addr += STEP_X * sizeof(DATA_TYPE);
- weights_addr += STEP_X * sizeof(DATA_TYPE);
- }
- }
- else if(coordy > (SRC_HEIGHT - 9))
- {
- for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
- {
- // Avoid loading rows beyond the input height
- const int end_z = SRC_HEIGHT - coordy;
- for(int i = 0; i < end_z; ++i)
- {
- CONVOLUTION1x9_NHWC(values, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
- }
- src_addr += STEP_X * sizeof(DATA_TYPE);
- weights_addr += STEP_X * sizeof(DATA_TYPE);
- }
- }
- else
- {
- for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
- {
- CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
- CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
- CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
- src_addr += STEP_X * sizeof(DATA_TYPE);
- weights_addr += STEP_X * sizeof(DATA_TYPE);
- }
- }
-
-#if defined(VEC_SIZE)
- REDUCE(values.s0, values0);
- REDUCE(values.s1, values1);
- REDUCE(values.s2, values2);
- REDUCE(values.s3, values3);
- REDUCE(values.s4, values4);
- REDUCE(values.s5, values5);
- REDUCE(values.s6, values6);
- REDUCE(values.s7, values7);
-#endif // defined(VEC_SIZE)
-
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-    values += (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(vector_offset(&biases, id0)))); // Broadcast the per-channel bias to all 8 output lanes
-#endif // defined(HAS_BIAS)
-
- *((__global DATA_TYPE *)(dst.ptr + 0 * dst_stride_y)) = values.s0;
- *((__global DATA_TYPE *)(dst.ptr + 1 * dst_stride_y)) = values.s1;
- *((__global DATA_TYPE *)(dst.ptr + 2 * dst_stride_y)) = values.s2;
- *((__global DATA_TYPE *)(dst.ptr + 3 * dst_stride_y)) = values.s3;
- *((__global DATA_TYPE *)(dst.ptr + 4 * dst_stride_y)) = values.s4;
- *((__global DATA_TYPE *)(dst.ptr + 5 * dst_stride_y)) = values.s5;
- *((__global DATA_TYPE *)(dst.ptr + 6 * dst_stride_y)) = values.s6;
- *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;
-#undef STEP_X
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) && defined(PAD_TOP)
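As the @note lines above indicate, the kernel is fully specialized at build time. A hypothetical set of build options for an F32 NHWC input of height 64 with 128 input channels, stride 1 and 4 rows of top padding (all values illustrative only):

    -DDATA_TYPE=float -DDATA_LAYOUT_NHWC=1 -DSTRIDE_X=1 -DSTRIDE_Y=1
    -DPAD_TOP=4 -DSRC_HEIGHT=64 -DWEIGHTS_DEPTH=128 -DVEC_SIZE=4 -DHAS_BIAS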
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
deleted file mode 100644
index 8237fe1700..0000000000
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ /dev/null
@@ -1,878 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
-
-#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-#if defined(DATA_LAYOUT_NHWC)
-
-#if KERNEL_SIZE == 9
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x9(acc, src_ptr, weights_ptr) CONVOLUTION1x9_STRIDE1(acc, src_ptr, weights_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x9(acc, src_ptr, weights_ptr) CONVOLUTION1x9_STRIDE2(acc, src_ptr, weights_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x9_STRIDE1(acc, src_ptr, weights_ptr) \
- ({ \
- int8 weights_values0 = 0; \
- int weights_value1 = 0; \
- weights_values0.s0 = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
- weights_values0.s1 = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
- weights_values0.s2 = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
- weights_values0.s3 = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
- weights_values0.s4 = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
- weights_values0.s5 = convert_int(*(weights_ptr + 5 * weights_stride_y)); \
- weights_values0.s6 = convert_int(*(weights_ptr + 6 * weights_stride_y)); \
- weights_values0.s7 = convert_int(*(weights_ptr + 7 * weights_stride_y)); \
- weights_value1 = convert_int(*(weights_ptr + 8 * weights_stride_y)); \
- \
- int8 src0 = 0; \
- int8 src1 = 0; \
-        src0.s0 = convert_int(*(src_ptr + 0 * src_stride_y));  \
-        src0.s1 = convert_int(*(src_ptr + 1 * src_stride_y));  \
-        src0.s2 = convert_int(*(src_ptr + 2 * src_stride_y));  \
-        src0.s3 = convert_int(*(src_ptr + 3 * src_stride_y));  \
-        src0.s4 = convert_int(*(src_ptr + 4 * src_stride_y));  \
-        src0.s5 = convert_int(*(src_ptr + 5 * src_stride_y));  \
-        src0.s6 = convert_int(*(src_ptr + 6 * src_stride_y));  \
-        src0.s7 = convert_int(*(src_ptr + 7 * src_stride_y));  \
-        src1.s0 = convert_int(*(src_ptr + 8 * src_stride_y));  \
-        src1.s1 = convert_int(*(src_ptr + 9 * src_stride_y));  \
-        src1.s2 = convert_int(*(src_ptr + 10 * src_stride_y)); \
-        src1.s3 = convert_int(*(src_ptr + 11 * src_stride_y)); \
-        src1.s4 = convert_int(*(src_ptr + 12 * src_stride_y)); \
-        src1.s5 = convert_int(*(src_ptr + 13 * src_stride_y)); \
-        src1.s6 = convert_int(*(src_ptr + 14 * src_stride_y)); \
-        src1.s7 = convert_int(*(src_ptr + 15 * src_stride_y)); \
- \
-        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset);                                   \
-        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
-        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
-        acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
-        acc += ((int8)(src0.s4567, src1.s0123) + input_offset) * ((int8)weights_values0.s4 + weight_offset);         \
-        acc += ((int8)(src0.s567, src1.s0123, src1.s4) + input_offset) * ((int8)weights_values0.s5 + weight_offset); \
-        acc += ((int8)(src0.s67, src1.s012, src1.s345) + input_offset) * ((int8)weights_values0.s6 + weight_offset); \
-        acc += ((int8)(src0.s7, src1.s0123, src1.s456) + input_offset) * ((int8)weights_values0.s7 + weight_offset); \
-        acc += (src1 + input_offset) * ((int8)weights_value1 + weight_offset);                                       \
- })
-
-#define CONVOLUTION1x9_STRIDE2(acc, src_ptr, weights_ptr) \
- ({ \
- int8 weights_values0 = 0; \
- int weights_value1 = 0; \
- weights_values0.s0 = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
- weights_values0.s1 = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
- weights_values0.s2 = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
- weights_values0.s3 = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
- weights_values0.s4 = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
- weights_values0.s5 = convert_int(*(weights_ptr + 5 * weights_stride_y)); \
- weights_values0.s6 = convert_int(*(weights_ptr + 6 * weights_stride_y)); \
- weights_values0.s7 = convert_int(*(weights_ptr + 7 * weights_stride_y)); \
- weights_value1 = convert_int(*(weights_ptr + 8 * weights_stride_y)); \
- \
- int16 src0 = 0; \
- int8 src1 = 0; \
-        src0.s0 = convert_int(*(src_ptr + 0 * src_stride_y));  \
-        src0.s1 = convert_int(*(src_ptr + 1 * src_stride_y));  \
-        src0.s2 = convert_int(*(src_ptr + 2 * src_stride_y));  \
-        src0.s3 = convert_int(*(src_ptr + 3 * src_stride_y));  \
-        src0.s4 = convert_int(*(src_ptr + 4 * src_stride_y));  \
-        src0.s5 = convert_int(*(src_ptr + 5 * src_stride_y));  \
-        src0.s6 = convert_int(*(src_ptr + 6 * src_stride_y));  \
-        src0.s7 = convert_int(*(src_ptr + 7 * src_stride_y));  \
-        src0.s8 = convert_int(*(src_ptr + 8 * src_stride_y));  \
-        src0.s9 = convert_int(*(src_ptr + 9 * src_stride_y));  \
-        src0.sA = convert_int(*(src_ptr + 10 * src_stride_y)); \
-        src0.sB = convert_int(*(src_ptr + 11 * src_stride_y)); \
-        src0.sC = convert_int(*(src_ptr + 12 * src_stride_y)); \
-        src0.sD = convert_int(*(src_ptr + 13 * src_stride_y)); \
-        src0.sE = convert_int(*(src_ptr + 14 * src_stride_y)); \
-        src0.sF = convert_int(*(src_ptr + 15 * src_stride_y)); \
-        src1.s0 = convert_int(*(src_ptr + 16 * src_stride_y)); \
-        src1.s1 = convert_int(*(src_ptr + 17 * src_stride_y)); \
-        src1.s2 = convert_int(*(src_ptr + 18 * src_stride_y)); \
-        src1.s3 = convert_int(*(src_ptr + 19 * src_stride_y)); \
-        src1.s4 = convert_int(*(src_ptr + 20 * src_stride_y)); \
-        src1.s5 = convert_int(*(src_ptr + 21 * src_stride_y)); \
-        src1.s6 = convert_int(*(src_ptr + 22 * src_stride_y)); \
-        src1.s7 = convert_int(*(src_ptr + 23 * src_stride_y)); \
- \
-        acc += (src0.s02468ACE + input_offset) * ((int8)weights_values0.s0 + weight_offset);                         \
-        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset);         \
-        acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
-        acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
-        acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + input_offset) * ((int8)weights_values0.s4 + weight_offset); \
-        acc += ((int8)(src0.s579, src0.sBDF, src1.s13) + input_offset) * ((int8)weights_values0.s5 + weight_offset); \
-        acc += ((int8)(src0.s68A, src0.sCE, src1.s024) + input_offset) * ((int8)weights_values0.s6 + weight_offset); \
-        acc += ((int8)(src0.s79B, src0.sDF, src1.s135) + input_offset) * ((int8)weights_values0.s7 + weight_offset); \
-        acc += ((int8)(src0.s8AC, src0.sE, src1.s0246) + input_offset) * ((int8)weights_value1 + weight_offset);     \
- })
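To make the swizzles above concrete: with STRIDE_X == 2, accumulator lane i consumes input element 2*i + k for filter tap k. A sketch of the resulting index pattern:

    // tap 0 -> inputs 0, 2, 4, ..., 14 == src0.s02468ACE
    // tap 1 -> inputs 1, 3, 5, ..., 15 == (int8)(src0.s1357, src0.s9BDF)
    // tap 8 -> inputs 8, 10, ..., 22   == (int8)(src0.s8AC, src0.sE, src1.s0246)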
-
-#elif KERNEL_SIZE == 5
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_ptr, weights_ptr) CONVOLUTION1x5_STRIDE1(acc, src_ptr, weights_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x5(acc, src_ptr, weights_ptr) CONVOLUTION1x5_STRIDE2(acc, src_ptr, weights_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_ptr, weights_ptr) \
- ({ \
- int4 weights_values0 = 0; \
- int weights_value1 = 0; \
- weights_values0.s0 = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
- weights_values0.s1 = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
- weights_values0.s2 = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
- weights_values0.s3 = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
- weights_value1 = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
- \
- int8 src0 = 0; \
- int4 src1 = 0; \
-        src0.s0 = convert_int(*(src_ptr + 0 * src_stride_y));  \
-        src0.s1 = convert_int(*(src_ptr + 1 * src_stride_y));  \
-        src0.s2 = convert_int(*(src_ptr + 2 * src_stride_y));  \
-        src0.s3 = convert_int(*(src_ptr + 3 * src_stride_y));  \
-        src0.s4 = convert_int(*(src_ptr + 4 * src_stride_y));  \
-        src0.s5 = convert_int(*(src_ptr + 5 * src_stride_y));  \
-        src0.s6 = convert_int(*(src_ptr + 6 * src_stride_y));  \
-        src0.s7 = convert_int(*(src_ptr + 7 * src_stride_y));  \
-        src1.s0 = convert_int(*(src_ptr + 8 * src_stride_y));  \
-        src1.s1 = convert_int(*(src_ptr + 9 * src_stride_y));  \
-        src1.s2 = convert_int(*(src_ptr + 10 * src_stride_y)); \
-        src1.s3 = convert_int(*(src_ptr + 11 * src_stride_y)); \
- \
- acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
- acc += ((int8)(src0.s45, src0.s67, src1.s0123) + input_offset) * ((int8)weights_value1 + weight_offset); \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_ptr, weights_ptr) \
- ({ \
- int4 weights_values0 = 0; \
- int weights_value1 = 0; \
- weights_values0.s0 = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
- weights_values0.s1 = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
- weights_values0.s2 = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
- weights_values0.s3 = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
- weights_value1 = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
- \
- int16 src0 = 0; \
- int4 src1 = 0; \
-        src0.s0 = convert_int(*(src_ptr + 0 * src_stride_y));  \
-        src0.s1 = convert_int(*(src_ptr + 1 * src_stride_y));  \
-        src0.s2 = convert_int(*(src_ptr + 2 * src_stride_y));  \
-        src0.s3 = convert_int(*(src_ptr + 3 * src_stride_y));  \
-        src0.s4 = convert_int(*(src_ptr + 4 * src_stride_y));  \
-        src0.s5 = convert_int(*(src_ptr + 5 * src_stride_y));  \
-        src0.s6 = convert_int(*(src_ptr + 6 * src_stride_y));  \
-        src0.s7 = convert_int(*(src_ptr + 7 * src_stride_y));  \
-        src0.s8 = convert_int(*(src_ptr + 8 * src_stride_y));  \
-        src0.s9 = convert_int(*(src_ptr + 9 * src_stride_y));  \
-        src0.sa = convert_int(*(src_ptr + 10 * src_stride_y)); \
-        src0.sb = convert_int(*(src_ptr + 11 * src_stride_y)); \
-        src0.sc = convert_int(*(src_ptr + 12 * src_stride_y)); \
-        src0.sd = convert_int(*(src_ptr + 13 * src_stride_y)); \
-        src0.se = convert_int(*(src_ptr + 14 * src_stride_y)); \
-        src0.sf = convert_int(*(src_ptr + 15 * src_stride_y)); \
-        src1.s0 = convert_int(*(src_ptr + 16 * src_stride_y)); \
-        src1.s1 = convert_int(*(src_ptr + 17 * src_stride_y)); \
-        src1.s2 = convert_int(*(src_ptr + 18 * src_stride_y)); \
-        src1.s3 = convert_int(*(src_ptr + 19 * src_stride_y)); \
- \
- acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
- acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset); \
- })
-
-#elif KERNEL_SIZE == 3
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_ptr, weights_ptr) CONVOLUTION1x3_STRIDE1(acc, src_ptr, weights_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x3(acc, src_ptr, weights_ptr) CONVOLUTION1x3_STRIDE2(acc, src_ptr, weights_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_ptr, weights_ptr) \
- ({ \
- int3 weights_values0 = 0; \
- weights_values0.s0 = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
- weights_values0.s1 = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
- weights_values0.s2 = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
- \
- int8 src0 = 0; \
- int2 src1 = 0; \
-        src0.s0 = convert_int(*(src_ptr + 0 * src_stride_y)); \
-        src0.s1 = convert_int(*(src_ptr + 1 * src_stride_y)); \
-        src0.s2 = convert_int(*(src_ptr + 2 * src_stride_y)); \
-        src0.s3 = convert_int(*(src_ptr + 3 * src_stride_y)); \
-        src0.s4 = convert_int(*(src_ptr + 4 * src_stride_y)); \
-        src0.s5 = convert_int(*(src_ptr + 5 * src_stride_y)); \
-        src0.s6 = convert_int(*(src_ptr + 6 * src_stride_y)); \
-        src0.s7 = convert_int(*(src_ptr + 7 * src_stride_y)); \
-        src1.s0 = convert_int(*(src_ptr + 8 * src_stride_y)); \
-        src1.s1 = convert_int(*(src_ptr + 9 * src_stride_y)); \
- \
- acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_ptr, weights_ptr) \
- ({ \
- int3 weights_values0 = 0; \
- weights_values0.s0 = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
- weights_values0.s1 = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
- weights_values0.s2 = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
- \
- int16 src0 = 0; \
- int src1 = 0; \
- src0.s0 = convert_int(*(src_ptr + 0 * src_stride_y)); \
- src0.s1 = convert_int(*(src_ptr + 1 * src_stride_y)); \
- src0.s2 = convert_int(*(src_ptr + 2 * src_stride_y)); \
- src0.s3 = convert_int(*(src_ptr + 3 * src_stride_y)); \
- src0.s4 = convert_int(*(src_ptr + 4 * src_stride_y)); \
- src0.s5 = convert_int(*(src_ptr + 5 * src_stride_y)); \
- src0.s6 = convert_int(*(src_ptr + 6 * src_stride_y)); \
- src0.s7 = convert_int(*(src_ptr + 7 * src_stride_y)); \
- src0.s8 = convert_int(*(src_ptr + 8 * src_stride_y)); \
- src0.s9 = convert_int(*(src_ptr + 9 * src_stride_y)); \
- src0.sa = convert_int(*(src_ptr + 10 * src_stride_y)); \
- src0.sb = convert_int(*(src_ptr + 11 * src_stride_y)); \
- src0.sc = convert_int(*(src_ptr + 12 * src_stride_y)); \
- src0.sd = convert_int(*(src_ptr + 13 * src_stride_y)); \
- src0.se = convert_int(*(src_ptr + 14 * src_stride_y)); \
- src0.sf = convert_int(*(src_ptr + 15 * src_stride_y)); \
- src1 = convert_int(*(src_ptr + 16 * src_stride_y)); \
- acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s2468, src0.sACE, src1) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- })
-
-#elif KERNEL_SIZE == 1
-
-#if STRIDE_X == 3
-#define INPUT_VALUE extract_input_stride3
-#elif STRIDE_X == 2
-#define INPUT_VALUE extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_VALUE extract_input_stride1
-
-#else /* STRIDE_X not equal to 1, 2 or 3 */
-#error "Only support strides 1, 2 and 3"
-#endif /* STRIDE_X */
-
-#endif // KERNEL_SIZE
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
- *
- * @param[in] input_value Pointer to the first value.
- * @param[in] stride_y    Stride between two consecutive values along the width (in bytes).
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value, const uint stride_y)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- vals;
- vals.s0 = *(input_value + 0 * stride_y);
- vals.s1 = *(input_value + 1 * stride_y);
- vals.s2 = *(input_value + 2 * stride_y);
- vals.s3 = *(input_value + 3 * stride_y);
- vals.s4 = *(input_value + 4 * stride_y);
- vals.s5 = *(input_value + 5 * stride_y);
- vals.s6 = *(input_value + 6 * stride_y);
- vals.s7 = *(input_value + 7 * stride_y);
-
- return vals;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
- *
- * @param[in] input_value Pointer to the first value.
- * @param[in] stride_y    Stride between two consecutive values along the width (in bytes).
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value, const uint stride_y)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- vals;
- vals.s0 = *(input_value + 0 * stride_y);
- vals.s1 = *(input_value + 2 * stride_y);
- vals.s2 = *(input_value + 4 * stride_y);
- vals.s3 = *(input_value + 6 * stride_y);
- vals.s4 = *(input_value + 8 * stride_y);
- vals.s5 = *(input_value + 10 * stride_y);
- vals.s6 = *(input_value + 12 * stride_y);
- vals.s7 = *(input_value + 14 * stride_y);
-
- return vals;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 (8-bit data).
- *
- * @param[in] input_value Pointer to the first value.
- * @param[in] stride_y    Stride between two consecutive values along the width (in bytes).
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value, const uint stride_y)
-{
- VEC_DATA_TYPE(DATA_TYPE, 8)
- vals;
- vals.s0 = *(input_value + 0 * stride_y);
- vals.s1 = *(input_value + 3 * stride_y);
- vals.s2 = *(input_value + 6 * stride_y);
- vals.s3 = *(input_value + 9 * stride_y);
- vals.s4 = *(input_value + 12 * stride_y);
- vals.s5 = *(input_value + 15 * stride_y);
- vals.s6 = *(input_value + 18 * stride_y);
- vals.s7 = *(input_value + 21 * stride_y);
-
- return vals;
-}
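Unlike the NCHW variants further down, which can use a single vloadN because width elements are contiguous, here the channels are innermost, so consecutive width positions sit stride_y bytes apart and must be gathered one scalar at a time.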
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
- * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] input_offset Input offset quantization parameter
- * @param[in] weight_offset Weights offset quantization parameter
- * @param[in] output_offset Output offset quantization parameter
- */
-__kernel void direct_convolution_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w,
- int input_offset,
- int weight_offset,
- int output_offset)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int8 values0 = 0;
-
- const int id0 = get_global_id(0);
- const int y_coord = (get_global_id(2) * STRIDE_Y) - PAD_TOP;
-
- __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
- __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0) - src_stride_x * id0 + y_coord * (int)src_stride_z;
-
- weights_addr += id0 * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
-#if KERNEL_SIZE == 9
- if(y_coord < 0)
- {
- const int start_z = -y_coord;
- for(int i = start_z; i < 9; ++i)
- {
- CONVOLUTION1x9(values0, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
- }
- }
- else if(y_coord > (SRC_HEIGHT - 9))
- {
- // Avoid loading rows beyond the input height
- const int end_z = SRC_HEIGHT - y_coord;
- for(int i = 0; i < end_z; ++i)
- {
- CONVOLUTION1x9(values0, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
- }
- }
- else
- {
- CONVOLUTION1x9(values0, src_addr, weights_addr);
- CONVOLUTION1x9(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
- CONVOLUTION1x9(values0, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
- }
-#elif KERNEL_SIZE == 5
-#if(PAD_TOP == 1) || (PAD_BOTTOM == 1)
-    if(y_coord < 0) // Special case: the row at Z = -1 does not exist
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));
- }
- else if(get_global_id(2) == (DST_HEIGHT - 1))
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- }
- else
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));
- }
-#elif(PAD_TOP == 2) || (PAD_BOTTOM == 2)
- if(y_coord < -1)
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));
- }
- else if(y_coord == -1)
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));
- }
- else if(y_coord == (SRC_HEIGHT - 3))
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- }
- else if(y_coord >= (SRC_HEIGHT - 4))
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- }
- else
- {
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));
- }
-#else /* PAD_TOP == 2 || PAD_BOTTOM == 2 */
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_z));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_z));
-#endif /* PAD_TOP == 1 || PAD_BOTTOM == 1 */
-#elif KERNEL_SIZE == 3
-#if(PAD_TOP > 0) || (PAD_BOTTOM > 0)
-    if(y_coord < 0) // Special case: the row at Z = -1 does not exist
- {
-        // Skip the first row and load the next two
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- }
- else if(y_coord == (SRC_HEIGHT - PAD_BOTTOM - 1))
- {
-        // Special case for the last output row: reading all three input rows would step into the
-        // bottom padding, which does not exist along the Z axis, so only the first two rows are read.
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- }
- else
- {
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
- }
-#else // PAD_TOP > 0 || PAD_BOTTOM > 0
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_z));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_z));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_z), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_z));
-#endif // PAD_TOP > 0 || PAD_BOTTOM > 0
-#elif KERNEL_SIZE == 1
- int weight = convert_int(*(__global DATA_TYPE *)weights_addr);
- int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr, src_stride_y));
- values0 += (input_value + input_offset) * ((int8)weight + weight_offset);
-#endif /* KERNEL_SIZE */
-
- src_addr += src_stride_x;
- weights_addr += weights_stride_x;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- __global int *bias_addr = ((__global int *)(vector_offset(&biases, id0)));
- values0 += (int8)(*bias_addr);
-#endif /* defined(HAS_BIAS) */
-
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
- values0 = values0 + output_offset;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- values = CONVERT_SAT(values0, DATA_TYPE);
- *(dst.ptr + 0 * dst_stride_y) = values.s0;
- *(dst.ptr + 1 * dst_stride_y) = values.s1;
- *(dst.ptr + 2 * dst_stride_y) = values.s2;
- *(dst.ptr + 3 * dst_stride_y) = values.s3;
- *(dst.ptr + 4 * dst_stride_y) = values.s4;
- *(dst.ptr + 5 * dst_stride_y) = values.s5;
- *(dst.ptr + 6 * dst_stride_y) = values.s6;
- *(dst.ptr + 7 * dst_stride_y) = values.s7;
-}
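For context, OUTPUT_MULTIPLIER and OUTPUT_SHIFT encode the real-valued rescaling factor in fixed point, in the usual gemmlowp style; a worked example with hypothetical values:

    // With OUTPUT_MULTIPLIER = 1 << 30 (0.5 in Q0.31) and OUTPUT_SHIFT = 1, the
    // effective multiplier is 0.5 / 2 = 0.25: an accumulator value of 100 becomes
    // round(100 * 0.25) = 25, then output_offset is added and the result is
    // saturated to the range of DATA_TYPE.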
-
-#else // defined(DATA_LAYOUT_NHWC)
-
-#if KERNEL_SIZE == 9
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- acc += (src0.lo + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1234, src0.s5678) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s2345, src0.s6789) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- acc += ((int8)(src0.s3456, src0.s789A) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
- acc += ((int8)(src0.s4567, src0.s89AB) + input_offset) * ((int8)weights_values0.s4 + weight_offset); \
- acc += ((int8)(src0.s5678, src0.s9ABC) + input_offset) * ((int8)weights_values0.s5 + weight_offset); \
- acc += ((int8)(src0.s6789, src0.sABCD) + input_offset) * ((int8)weights_values0.s6 + weight_offset); \
- acc += ((int8)(src0.s789A, src0.sBCDE) + input_offset) * ((int8)weights_values0.s7 + weight_offset); \
- acc += ((int8)(src0.s89AB, src0.sCDEF) + input_offset) * ((int8)weights_value1 + weight_offset); \
- })
-
-#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 8)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \
- acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
- acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + input_offset) * ((int8)weights_values0.s4 + weight_offset); \
- acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + input_offset) * ((int8)weights_values0.s5 + weight_offset); \
- acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + input_offset) * ((int8)weights_values0.s6 + weight_offset); \
- acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + input_offset) * ((int8)weights_values0.s7 + weight_offset); \
- acc += ((int8)(src0.s8ACE, src1.s0246) + input_offset) * ((int8)weights_value1 + weight_offset); \
- })
-
-#elif KERNEL_SIZE == 5
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
- int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
- int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \
- acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
- acc += ((int8)(src0.s45, src0.s67, src1.s0123) + input_offset) * ((int8)weights_value1 + weight_offset); \
- })
-
-#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \
- int weights_value1 = convert_int(*(weights_row_ptr + 4)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \
- acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
- acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset); \
- })
-
-#elif KERNEL_SIZE == 3
-
-#if STRIDE_X == 1
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
-#elif STRIDE_X == 2
-#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error "STRIDE_X larger than 2 is not supported"
-#endif /* STRIDE_X */
-
-#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
- int8 src0 = convert_int8(vload8(0, src_row_ptr)); \
- int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \
- acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- })
-
-#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
- ({ \
- int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \
- int16 src0 = convert_int16(vload16(0, src_row_ptr)); \
- int src1 = convert_int(*(src_row_ptr + 16)); \
- acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \
- acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
- acc += ((int8)(src0.s2468, src0.sACE, src1) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
- })
-
-#elif KERNEL_SIZE == 1
-
-#if STRIDE_X == 3
-#define INPUT_VALUE extract_input_stride3
-#elif STRIDE_X == 2
-#define INPUT_VALUE extract_input_stride2
-#elif STRIDE_X == 1
-#define INPUT_VALUE extract_input_stride1
-
-#else /* STRIDE_X not equal to 1, 2 or 3 */
-#error "Only support strides 1, 2 and 3"
-#endif /* STRIDE_X */
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value)
-{
- return vload8(0, input_value);
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp = vload16(0, input_value);
- return temp.s02468ace;
-}
-
-/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 (8-bit data).
- *
- * @param[in] input_value Pointer to the first value.
- *
- * @return extracted input values.
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value)
-{
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp1 = vload16(0, input_value);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- temp2 = vload16(0, input_value + 12);
- return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
-}
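The stride-3 gather above works with two overlapping 16-element loads: temp1.s0369 selects elements 0, 3, 6 and 9, and because temp2 starts 12 elements in, temp2.s0369 selects elements 12, 15, 18 and 21, giving the 8 values at stride 3.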
-
-#else /* KERNEL_SIZE not equal to 1, 3, 5 or 9 */
-#error "Only kernel sizes 1, 3, 5 and 9 are supported"
-#endif /* KERNEL_SIZE */
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
- * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
- * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] input_offset Input offset quantization parameter
- * @param[in] weight_offset Weights offset quantization parameter
- * @param[in] output_offset Output offset quantization parameter
- */
-__kernel void direct_convolution_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
- unsigned int weights_stride_w,
- int input_offset,
- int weight_offset,
- int output_offset)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int8 values0 = 0;
-
- __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
- __global DATA_TYPE *src_addr = (__global DATA_TYPE *)offset(&src, 0, 0);
-
- const int kernel_index = get_global_id(2);
- weights_addr += kernel_index * weights_stride_w;
-
- for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
- {
-#if KERNEL_SIZE == 9
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y));
- CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y));
-#elif KERNEL_SIZE == 5
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
- CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-#elif KERNEL_SIZE == 3
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
- CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-#elif KERNEL_SIZE == 1
- int weight = convert_int(*(__global DATA_TYPE *)weights_addr);
- int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr));
- values0 += (input_value + input_offset) * ((int8)weight + weight_offset);
-#endif /* KERNEL_SIZE */
-
- src_addr += src_stride_z;
- weights_addr += weights_stride_z;
- }
-
-#ifdef HAS_BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
- values0 += (int8)(*bias_addr);
-#endif /* defined(HAS_BIAS) */
-
-#if OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
- values0 = values0 + output_offset;
-
- vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-#endif // defined(DATA_LAYOUT_NHWC)
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
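In summary, both layout variants of direct_convolution_quantized compute the same asymmetric-quantization arithmetic (a sketch, with q denoting stored integer values):

    // acc   = sum over taps/channels of (q_src + input_offset) * (q_w + weight_offset)  [+ bias]
    // q_dst = saturate(requantize(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT) + output_offset)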
diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
deleted file mode 100644
index 810c5fc51a..0000000000
--- a/src/core/CL/cl_kernels/erode.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function erodes an input image.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void erode(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uchar16 top = vload16(0, offset(&src, -1, -1));
- uchar16 middle = vload16(0, offset(&src, -1, 0));
- uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
-
- vstore8(out, 0, dst.ptr);
-}
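For reference, the kernel removed above computes a 3x3 morphological erosion, vectorised so each work-item emits 8 pixels: a vertical min over three uchar16 rows, then a horizontal min over the three 8-wide shifted views tmp.s01234567, tmp.s12345678 and tmp.s23456789. A scalar OpenCL C equivalent for a single pixel (hypothetical helper; border handling is assumed to be done by the fill-border kernels):

uchar erode3x3_example(__global const uchar *src, int stride, int x, int y)
{
    // out(x, y) is the minimum over the 3x3 neighbourhood of src(x, y)
    uchar m = 255;
    for (int dy = -1; dy <= 1; ++dy)
    {
        for (int dx = -1; dx <= 1; ++dx)
        {
            m = min(m, src[(y + dy) * stride + (x + dx)]);
        }
    }
    return m;
}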
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
deleted file mode 100644
index 89c144ab5e..0000000000
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
- *
- * . . F 0 1 . . .
- * . E . . . 2 . .
- * D . . . . . 3 .
- * C . . P . . 4 .
- * B . . . . . 5 .
- * . A . . . 6 . .
- * . . 9 8 7 . . .
- */
-constant int offsets_s[16][2] =
-{
- { 0, -3 }, // 0
- { 1, -3 }, // 1
- { 2, -2 }, // 2
- { 3, -1 }, // 3
- { 3, 0 }, // 4
- { 3, 1 }, // 5
- { 2, 2 }, // 6
- { 1, 3 }, // 7
- { 0, 3 }, // 8
- { -1, 3 }, // 9
- { -2, 2 }, // A
- { -3, 1 }, // B
- { -3, 0 }, // C
- { -3, -1 }, // D
- { -2, -2 }, // E
- { -1, -3 }, // F
-};
-
-/** Load a pixel and set the mask values.
- *
- * @param[in] ptr The pointer to the starting address of source image
- * @param[in] a Index to indicate the position in the Bresenham circle
- * @param[in] stride Stride of source image in x dimension
- * @param[in] dark The left end of the threshold range
- * @param[in] bright The right end of the threshold range
- * @param[out] dark_mask   The bit-set mask recording dark pixels. Bit a is set to 1 if the corresponding pixel is dark
- * @param[out] bright_mask The bit-set mask recording bright pixels. Bit a is set to 1 if the corresponding pixel is bright
- *
- */
-#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
- { \
- unsigned char pixel; \
- pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]); \
- dark_mask |= (pixel < dark) << a; \
- bright_mask |= (pixel > bright) << a; \
- }
-
-/** Checks if a pixel is a corner. A pixel is considered a corner if 9 contiguous pixels in the Bresenham circle are all bright or all dark.
- *
- * @param[in]  bright_mask The mask recording positions of bright pixels
- * @param[in]  dark_mask   The mask recording positions of dark pixels
- * @param[out] isCorner    Indicates whether the candidate pixel is a corner
- */
-#define CHECK_CORNER(bright_mask, dark_mask, isCorner) \
- { \
- for(int i = 0; i < 16; i++) \
- { \
- isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
- isCorner |= ((dark_mask & 0x1FF) == 0x1FF); \
- if(isCorner) \
- { \
- break; \
- } \
- bright_mask >>= 1; \
- dark_mask >>= 1; \
- } \
- }
-
-/* Calculate the pixel's strength: binary search for the largest threshold at which the pixel is still detected as a corner */
-uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
-{
- short a = threshold;
- short b = 255;
- while(b - a > 1)
- {
- uchar c = convert_uchar_sat((a + b) / 2);
- unsigned int bright_mask = 0;
- unsigned int dark_mask = 0;
-
- unsigned char p_bright = add_sat(candidate_pixel, c);
- unsigned char p_dark = sub_sat(candidate_pixel, c);
-
- bool isCorner = 0;
-
- for(uint i = 0; i < 16; i++)
- {
- LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
- }
-
- bright_mask |= (bright_mask << 16);
- dark_mask |= (dark_mask << 16);
- CHECK_CORNER(bright_mask, dark_mask, isCorner);
-
- if(isCorner)
- {
- a = convert_short(c);
- }
- else
- {
- b = convert_short(c);
- }
- }
- return a;
-}
-
-/** Fast corners implementation. Calculates and returns the strength of each pixel.
- *
- * The algorithm loops through the 16 pixels in the Bresenham circle and sets the low 16 bits of the masks if the corresponding pixel is bright
- * or dark. It then copies the low 16 bits into the high 16 bits of the masks, and right-shifts the masks to check whether 9 contiguous bits
- * starting from the LSB are set.
- *
- * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] threshold_value Threshold value.
- *
- */
-__kernel void fast_corners(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output),
- float threshold_value)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- const unsigned char threshold = (uchar)threshold_value;
-
- unsigned int bright_mask = 0;
- unsigned int dark_mask = 0;
-
- unsigned char isCorner = 0;
-
- unsigned char p = *in.ptr;
- unsigned char p_bright = add_sat(p, threshold);
- unsigned char p_dark = sub_sat(p, threshold);
-
- LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-
- if(((bright_mask | dark_mask) & 0x1111) == 0)
- {
- *out.ptr = 0;
- return;
- }
-
- LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
- LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
-
- bright_mask |= (bright_mask << 16);
- dark_mask |= (dark_mask << 16);
-
- CHECK_CORNER(bright_mask, dark_mask, isCorner)
-
- if(!isCorner)
- {
- *out.ptr = 0;
- return;
- }
-
-#ifdef USE_MAXSUPPRESSION
- *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
-#else /* USE_MAXSUPPRESSION */
- *out.ptr = 1;
-#endif /* USE_MAXSUPPRESSION */
-}
-
-/** Copy result to Keypoint buffer and count number of corners
- *
- * @param[in]  input_ptr                            Pointer to the image with calculated strengths. Supported data types: U8
- * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] max_num_points The maximum number of keypoints the array can hold
- * @param[in]  offset                               The border offset added to each keypoint coordinate
- * @param[in,out] num_of_points                     Number of points found
- * @param[out] out The keypoints found
- *
- */
-__kernel void copy_to_keypoint(
- IMAGE_DECLARATION(input),
- uint max_num_points,
- uint offset,
- __global uint *num_of_points,
- __global Keypoint *out)
-{
-#ifndef UPDATE_NUMBER
- if(*num_of_points >= max_num_points)
- {
- return;
- }
-#endif /* UPDATE_NUMBER */
-
- Image in = CONVERT_TO_IMAGE_STRUCT(input);
-
- uchar value = *in.ptr;
-
- if(value > 0)
- {
- int id = atomic_inc(num_of_points);
- if(id < max_num_points)
- {
- out[id].strength = value;
- out[id].x = get_global_id(0) + offset;
- out[id].y = get_global_id(1) + offset;
- out[id].tracking_status = 1;
- out[id].scale = 0.f;
- out[id].orientation = 0.f;
- out[id].error = 0.f;
- }
- }
-}
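The key trick in the deleted FAST implementation is the wrap-around test in CHECK_CORNER: the 16 circle positions are recorded in the low 16 bits of a mask, the mask is ORed with itself shifted left by 16, and any window of 9 consecutive set bits, including windows crossing position 15 back to 0, then becomes detectable with plain right shifts. compute_strength reuses the same test inside a binary search over the threshold. A standalone OpenCL C restatement of the mask test (hypothetical function name):

bool has_nine_contiguous_example(uint mask)
{
    uint m = mask & 0xFFFFu;
    m |= m << 16; // duplicate so windows wrapping past position 15 stay contiguous
    for (int i = 0; i < 16; ++i)
    {
        if ((m & 0x1FFu) == 0x1FFu) // 9 set bits starting at the LSB
        {
            return true;
        }
        m >>= 1;
    }
    return false;
}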
diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
deleted file mode 100644
index ae2c31a848..0000000000
--- a/src/core/CL/cl_kernels/gaussian_pyramid.cl
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
- *
- * @note Each thread computes 8 pixels
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void gaussian1x5_sub_x(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values for the convolution (20 bytes needed)
- uchar16 temp0 = vload16(0, src.ptr);
- uchar4 temp1 = vload4(0, src.ptr + 16);
-
- // Convert to USHORT8
- ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
- ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
- ushort8 m_data = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
- ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
- ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
-
- // Compute convolution along the X direction
- ushort8 pixels = l2_data + r2_data;
- pixels += l1_data * (ushort8)4;
- pixels += m_data * (ushort8)6;
- pixels += r1_data * (ushort8)4;
-
- // Store result
- vstore8(pixels, 0, (__global ushort *)dst.ptr);
-}
-
-/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
- *
- * @note Each thread computes 8 pixels
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void gaussian5x1_sub_y(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
- ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
- ushort8 m_data = vload8(0, (__global ushort *)offset(&src, 0, 2));
- ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
- ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
-
- // Compute convolution along the Y direction
- ushort8 pixels = u2_data + d2_data;
- pixels += u1_data * (ushort8)4;
- pixels += m_data * (ushort8)6;
- pixels += d1_data * (ushort8)4;
-
- // Scale result
- pixels >>= (ushort8)8;
-
- // Store result
- vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
-}
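Both deleted pyramid kernels apply the binomial taps {1, 4, 6, 4, 1}, one axis per pass, and sub-sample by reading every other input column (the x pass) or row (the y pass). Each pass has gain 16, so the combined gain is 256, which is why gaussian5x1_sub_y ends with a right shift by 8. A scalar OpenCL C sketch of the combined 5x5 filter at one pixel (hypothetical helper; sub-sampling corresponds to evaluating this only at even output coordinates):

uchar gaussian5x5_example(__global const uchar *src, int stride, int x, int y)
{
    const int taps[5] = {1, 4, 6, 4, 1};
    int       acc     = 0;
    for (int dy = -2; dy <= 2; ++dy)
    {
        int row = 0;
        for (int dx = -2; dx <= 2; ++dx)
        {
            row += taps[dx + 2] * (int)src[(y + dy) * stride + (x + dx)];
        }
        acc += taps[dy + 2] * row;
    }
    return convert_uchar(acc >> 8); // acc <= 255 * 256, so the result fits in 8 bits
}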
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 54d38655a4..4bef02314f 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,17 +31,17 @@
* @param[in] offset The offset within the vector. Offset can only be of the same size of the OpenCL vector (2,3,4,8,16)
* @param[in] n0 The number of consecutive columns to access. n0 + offset must be <= 16
* @param[in] x Vector to access
- * @{
+ *
*/
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
-#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
// offset == 0
-#define scalar_access_0_1(x) ((x).s0)
-#define scalar_access_0_2(x) ((x).s01)
-#define scalar_access_0_3(x) ((x).s012)
-#define scalar_access_0_4(x) ((x).s0123)
-#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_1(x) ((x).s0)
+#define scalar_access_0_2(x) ((x).s01)
+#define scalar_access_0_3(x) ((x).s012)
+#define scalar_access_0_4(x) ((x).s0123)
+#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
// offset == 1
@@ -100,8 +100,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
- ({})
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({})
#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
@@ -186,8 +185,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
-#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR
/** Load 2D tensor (consecutive rows and columns) with Z offset.
@@ -202,8 +203,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({})
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({})
#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
@@ -279,8 +279,11 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_TENSOR_M0XN0
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_ROW_n
@@ -394,10 +397,323 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK
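For orientation before the new partial-load variants below, this is roughly what the pre-existing LOAD_BLOCK expands to for M0 = 2, N0 = 4 and DATA_TYPE = float, restated here as a hypothetical function (the real macro instead declares the a0/a1 vectors in the enclosing scope via LOAD_ROW_2):

void load_block_2x4_example(__global uchar *ptr, uint off, uint stride_y,
                            uint zin0, uint zin1, float4 *a0, float4 *a1)
{
    // One vload per row; each row applies its own z-axis byte offset
    *a0 = vload4(0, (__global float *)(ptr + off + 0 * stride_y + zin0));
    *a1 = vload4(0, (__global float *)(ptr + off + 1 * stride_y + zin1));
}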
+/** Partially load the 0 to (n-1)th rows of the given variables
+ * @name LOAD_ROW_PARTIAL_n
+ * Within each row, load the lower @p LOAD_N0 elements of vectors of width @p N0
+ *
+ * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] LOAD_N0   The **lower** size of the vectors to load. Supported: 1-16 and <= @p N0
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VLOAD_PARTIAL(N0, LOAD_N0) \
+ (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group LOAD_ROW_PARTIAL_n
+
+/** Partially load a block of the given size LOAD_M0xLOAD_N0
+ * @name LOAD_BLOCK_PARTIAL
+ *
+ * @note The vector width @p N0 is also required for correct partial loading behaviour.
+ * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for LOAD_M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for LOAD_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] LOAD_M0 The number of rows to load. Supported: 1-16
+ * @param[in] LOAD_N0 The lower number of elements of vectors to load. Supported: 1-16 and <= @p N0
+ * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** Load a block that can be partial in both x and y dimensions
+ *
+ * @note in case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ }
+/** Load a block that can only be partial in x but not y.
+ *
+ * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \
+ PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ }
+/** Load a block that can only be partial in y but not x.
+ *
+ * @note in case @p N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
+ */
+#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_COND_Y) \
+ if (!(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ }
+/** @} */ // end of group LOAD_BLOCK_PARTIAL
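The four-way dispatch in LOAD_BLOCK_PARTIAL_IN_X_AND_Y amounts to choosing an effective block shape independently per axis; a compact OpenCL C restatement (hypothetical helper, with plain bools standing in for PARTIAL_COND_Y/PARTIAL_COND_X):

void effective_block_shape_example(int m0, int n0, int partial_m0, int partial_n0,
                                   bool cond_y, bool cond_x, int *rows, int *cols)
{
    *rows = cond_y ? partial_m0 : m0; // PARTIAL_STORE_M0 only on the y boundary
    *cols = cond_x ? partial_n0 : n0; // PARTIAL_STORE_N0 only on the x boundary
}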
+/** Boundary-aware GeMM block load
+ * @name LOAD_BLOCK_BOUNDARY_AWARE
+ * This macro assumes the following schemes to achieve boundary-awareness:
+ * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
+ * - Non-overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
+ * - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
+ * The macro then ensures that the src tensor can be loaded without any padding in either the x or y dim.
+ *
+ * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
+ * blocks **at the end**.
+ * Say the src tensor is of shape MxN and we have M0 and N0 as the block sizes; this is how we define "partial blocks"/
+ * "boundary blocks" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and their various parameters:
+ *
+ * *--x--> x == 0 x == 1
+ * | |<------------------------------N-------------------------->|
+ * y |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
+ * | -------------#############################################################
+ * * | | |...............................|...........................|
+ * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
+ * | | |...............................|...........................|
+ * M --#############################################################
+ * | | | |...........................|
+ * y == 1 | M0 | Non-boundary block |....Boundary block in x....|
+ * | | | |...........................|
+ * |------------#############################################################
+ *
+ * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
+ *
+ * @note in case @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
+ *
+ * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either the X or Y dimension,
+ * and selects the corresponding load methods such that the boundary detection logic is only added when needed.
+ *
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * The macro will result in a declaration of @p M0 vectors of size @p N0 with data
+ * type @p DATA_TYPE containing values partially loaded from the specified
+ * address in memory. The remaining (N0 - PARTIAL_STORE_N0) elements will be
+ * filled with zeros.
+ *
+ * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
+ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
+ * @{
+ */
+#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case1: No partial blocks in either x or y
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+
+#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
+// Case2: Partial blocks in y
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
+
+#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
+// Case3: Partial blocks in x
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
+
+#else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
+// Case4: Partial blocks in both x and y
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+
+#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
+
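As a concrete (hypothetical) instantiation of the scheme above: with M = 7, N = 10, M0 = 4 and N0 = 4, the build options would carry PARTIAL_STORE_M0 = 7 % 4 = 3 and PARTIAL_STORE_N0 = 10 % 4 = 2, and the runtime conditions follow the placement rule, partial block first along y and last along x. The expressions below are an assumed work-item-side form, not necessarily what each kernel passes:

bool partial_cond_y_example(void)
{
    return get_global_id(1) == 0; // partial rows sit in the first block row
}

bool partial_cond_x_example(uint n, uint n0)
{
    return (get_global_id(0) + 1) * n0 >= n; // partial columns sit in the last block column
}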
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_TEXTURE2D_ROW_n
*
@@ -493,10 +809,195 @@
* @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
* @{
*/
-#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
-#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
/** @} */ // end of group LOAD_TEXTURE2D
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
+ * @name LOAD_ROW_INDIRECT_n
+ *
+ * @param[in] N0 The number of columns to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Y The y-axis offset vector
+ * @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0
+ * @{
+ */
+#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0; \
+ if (Y_MASK##0 != 0) \
+ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
+ else \
+ BASENAME##0 = 0;
+
+#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1; \
+ if (Y_MASK##1 != 0) \
+ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
+ else \
+ BASENAME##1 = 0;
+
+#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2; \
+ if (Y_MASK##2 != 0) \
+ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
+ else \
+ BASENAME##2 = 0;
+
+#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3; \
+ if (Y_MASK##3 != 0) \
+ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
+ else \
+ BASENAME##3 = 0;
+
+#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4; \
+ if (Y_MASK##4 != 0) \
+ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
+ else \
+ BASENAME##4 = 0;
+
+#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5; \
+ if (Y_MASK##5 != 0) \
+ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
+ else \
+ BASENAME##5 = 0;
+
+#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6; \
+ if (Y_MASK##6 != 0) \
+ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
+ else \
+ BASENAME##6 = 0;
+
+#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7; \
+ if (Y_MASK##7 != 0) \
+ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
+ else \
+ BASENAME##7 = 0;
+
+#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8; \
+ if (Y_MASK##8 != 0) \
+ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
+ else \
+ BASENAME##8 = 0;
+
+#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9; \
+ if (Y_MASK##9 != 0) \
+ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
+ else \
+ BASENAME##9 = 0;
+
+#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A; \
+ if (Y_MASK##A != 0) \
+ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
+ else \
+ BASENAME##A = 0;
+
+#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B; \
+ if (Y_MASK##B != 0) \
+ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
+ else \
+ BASENAME##B = 0;
+
+#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C; \
+ if (Y_MASK##C != 0) \
+ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
+ else \
+ BASENAME##C = 0;
+
+#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D; \
+ if (Y_MASK##D != 0) \
+ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
+ else \
+ BASENAME##D = 0;
+
+#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E; \
+ if (Y_MASK##E != 0) \
+ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
+ else \
+ BASENAME##E = 0;
+
+#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F; \
+ if (Y_MASK##F != 0) \
+ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
+ else \
+ BASENAME##F = 0;
+/** @} */ // end of group LOAD_ROW_INDIRECT_n
+
+/** Load blocks (consecutive rows and columns) with Y offset.
+ * @name LOAD_BLOCK_INDIRECT
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Y offsets and masks are expected to have consecutive names.
+ * E.g., for M0=3, and Y=yin, the expected Y offsets are yin0, yin1 and yin2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @param[in] Y The y-axis offset vector
+ * @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0
+ * @{
+ */
+#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+/** @} */ // end of group LOAD_BLOCK_INDIRECT
+
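Each LOAD_ROW_INDIRECT_n row is therefore an independently addressed, maskable vload; restated for one row with N0 = 4 as a hypothetical function:

float4 load_row_indirect_example(__global uchar *ptr, uint off, uint stride_y,
                                 int y0, int y_mask0)
{
    float4 row = (float4)(0.0f); // masked-out rows (e.g. out of bounds) become zeros
    if (y_mask0 != 0)
    {
        row = vload4(0, (__global float *)(ptr + off + y0 * stride_y));
    }
    return row;
}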
/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
* @name LOAD_ELEMENT_n
*
@@ -605,8 +1106,10 @@
* @param[in] STRIDE_Y The stride in y-axis direction
* @{
*/
-#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
-#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
/** Basic macros to calculate Z offset values from Z0 to Zn-1
@@ -624,49 +1127,49 @@
* @{
*/
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \
Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \
Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \
Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \
Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \
Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \
Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \
Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
- Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
@@ -704,8 +1207,10 @@
* @param[in] STRIDE_Y The stride value in y-axis direction
* @{
*/
-#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
-#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
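Numerically, CALCULATE_Z_OFFSET maps each row of a 3D-interpreted (GEMM3D) tensor to the byte padding inserted between consecutive 2D planes. For row i the computation is the one below, a hypothetical scalar form of CALCULATE_Z_OFFSET_1:

int z_offset_example(int i, int y, int height_gemm3d, int depth_gemm3d,
                     int cross_plane_pad, int stride_y)
{
    int z = (i + y) / height_gemm3d;         // plane index this row falls into
    z     = min(depth_gemm3d - 1, z);        // clamp to the last plane
    return z * (cross_plane_pad * stride_y); // cross-plane padding to skip, in bytes
}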
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
@@ -716,8 +1221,7 @@
* @param[in] SCALE The scale factor
* @{
*/
-#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
- BASENAME##0 *= (DATA_TYPE)SCALE;
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
@@ -792,7 +1296,7 @@
* @{
*/
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
-#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
/** Create a new vector containing the values at the given index for a set of given vectors
@@ -804,8 +1308,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
@@ -814,13 +1317,20 @@
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 4) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
-#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 8) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
-#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
+ (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
+ (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
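Concretely, each COLUMN_VECTORn gathers one lane from n source vectors, i.e. one column of the transposed block; a sketch for COLUMN_VECTOR4 with illustrative names in/res:

// COLUMN_VECTOR4(1, res, in, float) declares column 1 of a 4-row block:
float4 res1 = (float4)((in0).s1, (in1).s1, (in2).s1, (in3).s1);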
/** Create a new vector containing the values at the given index. Utility macros for transposing a column-vector
@@ -832,8 +1342,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
@@ -846,47 +1355,47 @@
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 8) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
-#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
-/** @} */ // end of group COLUMN_VECTORn
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+/** @} */ // end of group COLUMN_VECTOR_SCALARn
/** Create transposed vectors of the given vectors
* @name TRANSPOSE_K0Xn
*
* @param[in] K0 The size of the source vectors
* @param[in] BASENAME The basename of transposed vectors
- * @param[in] B The basename of source vectors for transposition
+ * @param[in] BS The basename of source vectors for transposition
* @param[in] TYPE The data type of the transposed vectors
* @{
*/
-#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) \
- COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE);
-#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
- COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
-#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
- TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
-#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
- TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
-#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
- TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
-#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
- TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \
- COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
+ COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
+ TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
+ TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
+ TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
+ TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \
+ COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
/** @} */ // end of group TRANSPOSE_K0Xn
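Putting the previous groups together, TRANSPOSE_K0XN0(K0, N0, ...) emits N0 column gathers; a hedged sketch for K0=4, N0=2, assuming b0..b3 are the 2-wide source rows and b_t the transposed output:

// TRANSPOSE_K0XN0(4, 2, b_t, b, float) -> TRANSPOSE_K0X2 -> two COLUMN_VECTOR4 gathers:
float4 b_t0 = (float4)((b0).s0, (b1).s0, (b2).s0, (b3).s0);  // column 0 of the block
float4 b_t1 = (float4)((b0).s1, (b1).s1, (b2).s1, (b3).s1);  // column 1 of the block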
@@ -895,37 +1404,37 @@
* @param[in] K0 The number of source vectors
* @param[in] IDX_COL The index value
* @param[in] BASENAME The basename of the destination vectors
- * @param[in] B The basename of the source vectors
+ * @param[in] BS The basename of the source vectors
* @param[in] TYPE The data type of the destination vectors
*/
-#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
- CONCAT(COLUMN_VECTOR, K0) \
- (IDX_COL, BASENAME, B, TYPE);
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
+ CONCAT(COLUMN_VECTOR, K0) \
+ (IDX_COL, BASENAME, BS, TYPE);
/** Create column vectors to contain the values at the given index. Utility macro for transposing a column-vector
*
* @param[in] K0 The number of source vectors
* @param[in] IDX_COL The index value
* @param[in] BASENAME The basename of the destination vectors
- * @param[in] B The basename of the source vectors
+ * @param[in] BS The basename of the source vectors
* @param[in] TYPE The data type of the destination vectors
*/
-#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \
- CONCAT(COLUMN_VECTOR_SCALAR, K0) \
- (IDX_COL, BASENAME, B, TYPE);
+#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
+ CONCAT(COLUMN_VECTOR_SCALAR, K0) \
+ (IDX_COL, BASENAME, BS, TYPE);
/** Create transposed vectors from the given source vectors
*
* @param[in] K0 The size of source vectors
* @param[in] N0 The number of source vectors
* @param[in] BASENAME The basename of transposed vectors
- * @param[in] B The basename of source vectors for transposition
+ * @param[in] BS The basename of source vectors for transposition
* @param[in] TYPE The data type of the transposed vectors
*
*/
-#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
- CONCAT(TRANSPOSE_K0X, N0) \
- (K0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
+ CONCAT(TRANSPOSE_K0X, N0) \
+ (K0, BASENAME, BS, TYPE);
/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
* @name ADD_ROW_n
@@ -934,8 +1443,7 @@
* @param[in] BIAS The basename of the added variables
* @{
*/
-#define ADD_ROW_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS##0;
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
#define ADD_ROW_2(BASENAME, BIAS) \
ADD_ROW_1(BASENAME, BIAS) \
@@ -1010,7 +1518,7 @@
* @{
*/
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
-#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
/** Broadcast (add single value) to each element of the destination variables
@@ -1020,8 +1528,7 @@
* @param[in] BIAS The variable containing the value to add
* @{
*/
-#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS;
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
@@ -1082,6 +1589,7 @@
#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
BASENAME##F += BIAS;
+/** @} */ // end of group ADD_ROW_BROADCAST_n
/** Broadcast (add a value) to each element of the destination block (BASENAME)
* @name ADD_BLOCK_BROADCAST
@@ -1094,7 +1602,7 @@
* @{
*/
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
-#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
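The two add variants differ only in whether the bias is per-row or shared; a sketch for N=2 with illustrative names c/bias, following the ADD_ROW_1 and ADD_ROW_BROADCAST_1 bodies above:

// ADD_BLOCK(2, c, bias):            per-row bias vectors
c0 += bias0;
c1 += bias1;
// ADD_BLOCK_BROADCAST(2, c, bias0): one bias vector shared by all rows
c0 += bias0;
c1 += bias0;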
/** Apply activation to the given variables
@@ -1184,8 +1692,10 @@
* @param[in] B_VAL Additional value required by the activation
* @{
*/
-#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
-#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
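As used by the kernels deleted below, e.g. ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL). A sketch of the effect, assuming ACTIVATION_ROW_n rewrites each row in place through the ACTIVATION macro from the activation helpers (exact plumbing not shown in this patch):

// Assumed shape of one expanded row:
c0 = ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, c0, A_VAL, B_VAL);
c1 = ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, c1, A_VAL, B_VAL);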
/** Apply convert_<data_type> to the given variables
@@ -1195,6 +1705,7 @@
* @param[in] DATA_TYPE The data type of the vectors
* @param[in] BASENAME_SRC The basename of the source variables
* @param[in] BASENAME_DST The basename of the destination variables
+ * @{
*/
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
VEC_DATA_TYPE(DATA_TYPE, N) \
@@ -1286,7 +1797,10 @@
* @param[in] DATA_TYPE The data type of the vectors
* @param[in] BASENAME_SRC The basename of the source variables
* @param[in] BASENAME_DST The basename of the destination variables
+ * @{
*/
-#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-/** @} */ // end of group CONVERT_BLOCK
\ No newline at end of file
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
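The f16_acc32 kernel deleted below spells out by hand what this group generates; a sketch of CONVERT_BLOCK(2, 8, half, c, c_h), assuming CONVERT_ROW_n wraps the OpenCL convert_ builtins:

// CONVERT_BLOCK(2, 8, half, c, c_h) should declare, per row:
half8 c_h0 = convert_half8(c0);
half8 c_h1 = convert_half8(c1);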
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/gemm_v1.cl
deleted file mode 100644
index 5f8b4f694e..0000000000
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ /dev/null
@@ -1,3238 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 4;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global float *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(float));
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- float4 c0 = 0.0f;
- float4 c1 = 0.0f;
- float4 c2 = 0.0f;
- float4 c3 = 0.0f;
-
- for(; src_addr_b <= (src_end_addr_b - (int)(8 * H0)); src_addr_a += 8 * V0, src_addr_b += 8 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- c0 += (float4)a0.s0 * b0;
- c1 += (float4)a0.s1 * b0;
- c2 += (float4)a0.s2 * b0;
- c3 += (float4)a0.s3 * b0;
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4 * V0);
- b0 = vload4(0, src_addr_b + 4 * H0);
-
- c0 += (float4)a0.s0 * b0;
- c1 += (float4)a0.s1 * b0;
- c2 += (float4)a0.s2 * b0;
- c3 += (float4)a0.s3 * b0;
- }
-
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 4 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- c0 += (float4)a0.s0 * b0;
- c1 += (float4)a0.s1 * b0;
- c2 += (float4)a0.s2 * b0;
- c3 += (float4)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
- LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
-
- LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x4 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
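The @note block above is effectively a contract on host-side build options; a hypothetical clBuildProgram option string for this kernel (all values illustrative, not taken from the patch):

/* Hypothetical build options for gemm_mm_interleaved_transposed_f32 (values illustrative). */
const char *build_opts =
    "-DM=64 -DN=128 -DK=32 "                      /* destination rows/cols, un-reshaped K    */
    "-DH0=4 -DV0=4 "                              /* transposition width / interleave height */
    "-DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1 "  /* boundary store block sizes              */
    "-DALPHA=0.5f";                               /* optional alpha scaling                  */
/* err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL); */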
-
-/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 4;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- float4 c0 = 0.0f;
- float4 c1 = 0.0f;
- float4 c2 = 0.0f;
- float4 c3 = 0.0f;
-
- int i = 0;
- for(; i <= (int)(K - 4); i += 4)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
- }
-
- for(; i < (int)K; ++i)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, src_addr_a);
- float4 b0 = vload4(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 4 * H0;
-
- c0.s0 = fma(a0.s0, b0.s0, c0.s0);
- c0.s1 = fma(a0.s0, b0.s1, c0.s1);
- c0.s2 = fma(a0.s0, b0.s2, c0.s2);
- c0.s3 = fma(a0.s0, b0.s3, c0.s3);
-
- c1.s0 = fma(a0.s1, b0.s0, c1.s0);
- c1.s1 = fma(a0.s1, b0.s1, c1.s1);
- c1.s2 = fma(a0.s1, b0.s2, c1.s2);
- c1.s3 = fma(a0.s1, b0.s3, c1.s3);
-
- c2.s0 = fma(a0.s2, b0.s0, c2.s0);
- c2.s1 = fma(a0.s2, b0.s1, c2.s1);
- c2.s2 = fma(a0.s2, b0.s2, c2.s2);
- c2.s3 = fma(a0.s2, b0.s3, c2.s3);
-
- c3.s0 = fma(a0.s3, b0.s0, c3.s0);
- c3.s1 = fma(a0.s3, b0.s1, c3.s1);
- c3.s2 = fma(a0.s3, b0.s2, c3.s2);
- c3.s3 = fma(a0.s3, b0.s3, c3.s3);
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
- LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
-
- LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, float, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x4 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
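The Bifrost variant above trades the vector multiply-accumulate of the Midgard kernel for explicit scalar fma chains, unrolled four K-steps per iteration; one K-step is equivalent to this condensed vector-fma form (a sketch, not the literal code):

// Condensed sketch of one K-step of the 4x4 rank-1 update:
inline void rank1_update_4x4(float4 a0, float4 b0,
                             float4 *c0, float4 *c1, float4 *c2, float4 *c3)
{
    *c0 = fma((float4)a0.s0, b0, *c0);  // row 0 += a(0,k) * b(k, 0..3)
    *c1 = fma((float4)a0.s1, b0, *c1);
    *c2 = fma((float4)a0.s2, b0, *c2);
    *c3 = fma((float4)a0.s3, b0, *c3);
}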
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 8;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global half *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(half));
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- half8 c0 = 0.0f;
- half8 c1 = 0.0f;
- half8 c2 = 0.0f;
- half8 c3 = 0.0f;
-
- for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- c0 += (half8)a0.s0 * b0;
- c1 += (half8)a0.s1 * b0;
- c2 += (half8)a0.s2 * b0;
- c3 += (half8)a0.s3 * b0;
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4 * V0);
- b0 = vload8(0, src_addr_b + 8 * H0);
-
- c0 += (half8)a0.s0 * b0;
- c1 += (half8)a0.s1 * b0;
- c2 += (half8)a0.s2 * b0;
- c3 += (half8)a0.s3 * b0;
- }
-
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- c0 += (half8)a0.s0 * b0;
- c1 += (half8)a0.s1 * b0;
- c2 += (half8)a0.s2 * b0;
- c3 += (half8)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, half, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
-
- LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x8 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
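Note the tile-shape change relative to the f32 kernels: the fp16 path works on 4x8 blocks, so the per-work-item B offset and loads are 8 wide. For contrast (offsets reproduced from the kernels):

// f32 kernels: offset_row_b = (get_global_id(0) % H0) * 4;  followed by vload4 of B
// f16 kernel : offset_row_b = (get_global_id(0) % H0) * 8;  followed by vload8 of B
const int offset_row_a = (get_global_id(1) % V0) * 4;  // A block: 4 interleaved rows (both paths)
const int offset_row_b = (get_global_id(0) % H0) * 8;  // B block: 8 transposed columns (fp16 path)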
-
-/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 8;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global half *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(half));
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- float8 c0 = 0.0f;
- float8 c1 = 0.0f;
- float8 c2 = 0.0f;
- float8 c3 = 0.0f;
-
- for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = convert_float4(vload4(0, src_addr_a));
- float8 b0 = convert_float8(vload8(0, src_addr_b));
-
- c0 += (float8)a0.s0 * b0;
- c1 += (float8)a0.s1 * b0;
- c2 += (float8)a0.s2 * b0;
- c3 += (float8)a0.s3 * b0;
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = convert_float4(vload4(0, src_addr_a + 4 * V0));
- b0 = convert_float8(vload8(0, src_addr_b + 8 * H0));
-
- c0 += (float8)a0.s0 * b0;
- c1 += (float8)a0.s1 * b0;
- c2 += (float8)a0.s2 * b0;
- c3 += (float8)a0.s3 * b0;
- }
-
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = convert_float4(vload4(0, src_addr_a));
- float8 b0 = convert_float8(vload8(0, src_addr_b));
-
- c0 += (float8)a0.s0 * b0;
- c1 += (float8)a0.s1 * b0;
- c2 += (float8)a0.s2 * b0;
- c3 += (float8)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias_f0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
- float8 bias_f1 = convert_float8(bias1);
- float8 bias_f2 = convert_float8(bias2);
- float8 bias_f3 = convert_float8(bias3);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias_f);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- half8 c_h0 = convert_half8(c0);
- half8 c_h1 = convert_half8(c1);
- half8 c_h2 = convert_half8(c2);
- half8 c_h3 = convert_half8(c3);
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
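-    // STORE_BLOCK_BOUNDARY_AWARE only issues partial stores for the work-items that own the
-    // last (possibly incomplete) tile: cond_y/cond_x mark whether this tile touches the
-    // bottom/right edge, and PARTIAL_STORE_M0/PARTIAL_STORE_N0 give the leftover sizes.
-    // E.g., assuming M=6 (illustrative), the tile starting at row 4 has cond_y == true and
-    // only PARTIAL_STORE_M0=2 of its 4 rows are written.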
- // Store 4x8 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel, optimized for Bifrost architectures, computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of the destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to the number of rows (M) of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                              (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int x = get_global_id(0) / H0;
- int y = get_global_id(1) / V0;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % V0) * 4;
- const int offset_row_b = (get_global_id(0) % H0) * 8;
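-    // The reshape step stores V0 interleaved 4-row tiles of A and H0 transposed 8-column
-    // tiles of B side by side, hence the modulo indexing above. E.g., assuming V0=2 and
-    // get_global_id(1)=3 (illustrative values), this work-item maps to reshaped row
-    // y = 3 / 2 = 1 and starts offset_row_a = (3 % 2) * 4 = 4 halves into it.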
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
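-    // E.g., in the batched 2D Winograd case with MATRIX_B_DEPTH=16 (illustrative), batch
-    // index z=19 wraps to plane 19 % 16 = 3 of matrix B, so B is reused across batches
-    // rather than read out of bounds.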
-
- __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- half8 c0 = 0.0f;
- half8 c1 = 0.0f;
- half8 c2 = 0.0f;
- half8 c3 = 0.0f;
-
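-    // The main loop below consumes 4 values of K per iteration. When V0 == 1, two consecutive
-    // K-steps of the interleaved A tile are contiguous, so a single half8 load of A feeds two
-    // rank-1 fma updates of the 4x8 accumulator block; otherwise A is re-loaded as a half4
-    // per K-step.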
- int i = 0;
- for(; i <= (int)(K - 4); i += 4)
- {
-#if V0 == 1
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half8 a0 = vload8(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- src_addr_a += 8 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix B (transposed)
- b0 = vload8(0, src_addr_b);
-
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s4, b0, c0);
- c1 = fma((half8)a0.s5, b0, c1);
- c2 = fma((half8)a0.s6, b0, c2);
- c3 = fma((half8)a0.s7, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload8(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 8 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix B (transposed)
- b0 = vload8(0, src_addr_b);
-
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s4, b0, c0);
- c1 = fma((half8)a0.s5, b0, c1);
- c2 = fma((half8)a0.s6, b0, c2);
- c3 = fma((half8)a0.s7, b0, c3);
-#else // V0 == 1
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-
- // Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a);
- b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
-#endif // V0 == 1
- }
-
- for(; i < (int)K; ++i)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, src_addr_a);
- half8 b0 = vload8(0, src_addr_b);
-
- src_addr_a += 4 * V0;
- src_addr_b += 8 * H0;
-
- c0 = fma((half8)a0.s0, b0, c0);
- c1 = fma((half8)a0.s1, b0, c1);
- c2 = fma((half8)a0.s2, b0, c2);
- c3 = fma((half8)a0.s3, b0, c3);
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
-    // The plane (zout) is calculated by dividing the row index (get_global_id(1) * 4) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(4, half, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
- LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(4, half, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store 4x8 block
- const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-
-#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
-#if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(DATA_TYPE)
-#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, N0)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
- *
- * @note This OpenCL kernel works with floating point data types (F16/F32)
- * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to the number of rows (M) of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                              (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
-    // Compute starting address for matrix A and matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
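-    // COMPUTE_M0_START_ROW shifts every block except the first one up by
-    // (M0 - PARTIAL_STORE_M0) % M0 rows, so the single partial block sits at the top (which
-    // is why the store at the end checks get_global_id(1) == 0). E.g., assuming M=7, M0=4 and
-    // PARTIAL_STORE_M0=3 (illustrative values), block y=0 starts at row 0 and block y=1 at row 3.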
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(DATA_TYPE);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
-    // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- int end_row_vec_a = src_addr.s0 + (K * sizeof(DATA_TYPE));
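-    // end_row_vec_a is one-past-the-last byte of this work-item's row of A: the first loop
-    // below consumes K in steps of 2 (advancing A by 2 elements and B by 2 rows), and the
-    // second loop mops up a possible odd trailing element.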
-
- VECTOR_TYPE acc0 = 0.0f;
-#if M0 > 1
- VECTOR_TYPE acc1 = 0.0f;
-#endif // M0 > 1
-#if M0 > 2
- VECTOR_TYPE acc2 = 0.0f;
-#endif // M0 > 2
-#if M0 > 3
- VECTOR_TYPE acc3 = 0.0f;
-#endif // M0 > 3
-
- for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- LOAD_BLOCK(M0, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
- VECTOR_TYPE b1 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
-
- // Accumulate
- acc0 += b0 * (VECTOR_TYPE)a0.s0;
- acc0 += b1 * (VECTOR_TYPE)a0.s1;
-#if M0 > 1
- acc1 += b0 * (VECTOR_TYPE)a1.s0;
- acc1 += b1 * (VECTOR_TYPE)a1.s1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 += b0 * (VECTOR_TYPE)a2.s0;
- acc2 += b1 * (VECTOR_TYPE)a2.s1;
-#endif // M0 > 2
-#if M0 > 3
- acc3 += b0 * (VECTOR_TYPE)a3.s0;
- acc3 += b1 * (VECTOR_TYPE)a3.s1;
-#endif // M0 > 3
- }
-
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
-
- // Accumulate
- acc0 += b0 * (VECTOR_TYPE)a0;
-#if M0 > 1
- acc1 += b0 * (VECTOR_TYPE)a1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 += b0 * (VECTOR_TYPE)a2;
-#endif // M0 > 2
-#if M0 > 3
- acc3 += b0 * (VECTOR_TYPE)a3;
-#endif // M0 > 3
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
-    // The plane (zout) is calculated by dividing the row index by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
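-    // ADD_BLOCK_BROADCAST adds the single bias row to every one of the M0 accumulator rows,
-    // i.e. acc_i += bias0 for i = 0..M0-1, which matches a bias vector of shape [N].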
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(DATA_TYPE)
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=4.
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to the number of rows (M) of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                              (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for matrix B
- src_addr.s1 += idx * sizeof(float);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
-    // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize accumulators
- float4 acc0 = 0.0f;
-
-#if M0 > 1
- float4 acc1 = 0.0f;
-#endif // M0 > 1
-
-#if M0 > 2
- float4 acc2 = 0.0f;
-#endif // M0 > 2
-
-#if M0 > 3
- float4 acc3 = 0.0f;
-#endif // M0 > 3
-
- // A and B src indices get incremented at the same time.
- int i = 0;
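-    // Each iteration of the unrolled loop below performs four rank-1 updates: for every k in
-    // the 4-element chunk, acc_m += a_m[k] * b_k, written out as scalar fma() calls so they
-    // map directly onto the FMA units this kernel targets.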
- for(; i <= ((int)K - 4); i += 4)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
-        // Load values from matrix A
- LOAD_BLOCK(M0, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
-        // Load values from matrix A
- float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
-#endif // M0 > 3
-
-        // Load values from matrix B
- b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
-#endif // M0 > 3
-
-        // Load values from matrix B
- b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
-#endif // M0 > 3
-
-        // Load values from matrix B
- b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
- acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
- acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
- acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
-
-#if M0 > 1
-
- acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
- acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
- acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
- acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
-
-#endif // M0 > 1
-#if M0 > 2
-
- acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
- acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
- acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
- acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
-
-#endif // M0 > 2
-#if M0 > 3
-
- acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
- acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
- acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
- acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- src_addr.s0 += 4 * sizeof(float);
- }
-
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0, b0.s0, acc0.s0);
- acc0.s1 = fma(a0, b0.s1, acc0.s1);
- acc0.s2 = fma(a0, b0.s2, acc0.s2);
- acc0.s3 = fma(a0, b0.s3, acc0.s3);
-#if M0 > 1
- acc1.s0 = fma(a1, b0.s0, acc1.s0);
- acc1.s1 = fma(a1, b0.s1, acc1.s1);
- acc1.s2 = fma(a1, b0.s2, acc1.s2);
- acc1.s3 = fma(a1, b0.s3, acc1.s3);
-#endif // M0 > 1
-#if M0 > 2
- acc2.s0 = fma(a2, b0.s0, acc2.s0);
- acc2.s1 = fma(a2, b0.s1, acc2.s1);
- acc2.s2 = fma(a2, b0.s2, acc2.s2);
- acc2.s3 = fma(a2, b0.s3, acc2.s3);
-#endif // M0 > 2
-#if M0 > 3
- acc3.s0 = fma(a3, b0.s0, acc3.s0);
- acc3.s1 = fma(a3, b0.s1, acc3.s1);
- acc3.s2 = fma(a3, b0.s2, acc3.s2);
- acc3.s3 = fma(a3, b0.s3, acc3.s3);
-#endif // M0 > 3
-
- src_addr.s0 += sizeof(float);
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
-    // The plane (zout) is calculated by dividing the row index by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
- LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 4, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
- *       This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=2.
- * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) must be equal to the number of rows (M) of matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                              (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
-    // Requires N0 = 2: C is kept as a vect2, A is loaded as vect4, B as 2 x vload2. TODO: fix for M0 > 1
- int idx = get_global_id(0) * N0;
-
-    // Compute starting address for matrix A and matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(float);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
-    // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize accumulators
- float2 acc0 = 0.0f;
-#if M0 > 1
- float2 acc1 = 0.0f;
-#endif // M0 > 1
-#if M0 > 2
- float2 acc2 = 0.0f;
-#endif // M0 > 2
-#if M0 > 3
- float2 acc3 = 0.0f;
-#endif // M0 > 3
-
- // A and B src indices get incremented at the same time.
- int i = 0;
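-    // This variant keeps only N0 = 2 live columns per work-item, which leaves room to unroll
-    // K by 8: eight rows of B (eight vload2 calls) are combined with one float8 of A per
-    // accumulator row each iteration, which suits narrow (N <= 1000) matrices.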
- for(; i <= ((int)K - 8); i += 8)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
- acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
- acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
- acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
- acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
- acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
- acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
- acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
-
- acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
- acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
- acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
- acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
- acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
- acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
- acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
- acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
-
-#if M0 > 1
-#if defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
- acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
- acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
- acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
- acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
- acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
- acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
- acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
- acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
-
- acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
- acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
- acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
- acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
- acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
- acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
- acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
- acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
-#endif // M0 > 1
-#if M0 > 2
-#if defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
- acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
- acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
- acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
- acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
- acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
- acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
- acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
- acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
-
- acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
- acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
- acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
- acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
- acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
- acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
- acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
- acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
-#endif // M0 > 2
-#if M0 > 3
-#if defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#else // defined(REINTERPRET_INPUT_AS_3D)
- a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
- acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
- acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
- acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
- acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
- acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
- acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
- acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
- acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
-
- acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
- acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
- acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
- acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
- acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
- acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
- acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
- acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
-#endif // M0 > 3
-
- src_addr.s0 += sizeof(float) * 8;
- }
- // Left-over loop: process any remaining columns of matrix A one float at a time
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Multiply and accumulate
- acc0.s0 = fma(a0, b0.s0, acc0.s0);
- acc0.s1 = fma(a0, b0.s1, acc0.s1);
-#if M0 > 1
- acc1.s0 = fma(a1, b0.s0, acc1.s0);
- acc1.s1 = fma(a1, b0.s1, acc1.s1);
-#endif // M0 > 1
-#if M0 > 2
- acc2.s0 = fma(a2, b0.s0, acc2.s0);
- acc2.s1 = fma(a2, b0.s1, acc2.s1);
-#endif // M0 > 2
-#if M0 > 3
- acc3.s0 = fma(a3, b0.s0, acc3.s0);
- acc3.s1 = fma(a3, b0.s1, acc3.s1);
-#endif // M0 > 3
-
- src_addr.s0 += sizeof(float);
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the row index by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
-
- LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, float, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 2 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 2, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
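
A minimal host-side sketch (plain C, standard OpenCL API) showing how the compile-time options documented for these kernels can be assembled. The tile sizes and matrix dimensions below are illustrative assumptions, not values fixed by this file:

    #include <CL/cl.h>

    /* program and device are assumed to have been created/selected already */
    const char *build_opts =
        "-DM0=4 -DN0=8 -DK=256 -DN=64 "
        "-DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1 "
        "-DALPHA=1.0f";
    cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
    cl_kernel gemm = clCreateKernel(program, "gemm_mm_floating_point_f16_bifrost_acc32", &err);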
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-/** This OpenCL kernel computes the matrix-by-matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=8.
- * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(half);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- float8 acc0 = 0.0f;
-#if M0 > 1
- float8 acc1 = 0.0f;
-#endif // M0 > 1
-#if M0 > 2
- float8 acc2 = 0.0f;
-#endif // M0 > 2
-#if M0 > 3
- float8 acc3 = 0.0f;
-#endif // M0 > 3
-
- int i = 0;
- for(; i <= ((int)K - 4); i += 4)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
-
- // Accumulate
- acc0 = fma(b0, (float8)a0.s0, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s0, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s0, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s0, acc3);
-#endif // M0 > 3
-
- b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (float8)a0.s1, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s1, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s1, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s1, acc3);
-#endif // M0 > 3
-
- b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (float8)a0.s2, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s2, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s2, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s2, acc3);
-#endif // M0 > 3
-
- b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (float8)a0.s3, acc0);
-#if M0 > 1
- acc1 = fma(b0, (float8)a1.s3, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2.s3, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3.s3, acc3);
-#endif // M0 > 3
-
- src_addr.s0 += 4 * sizeof(half);
- }
-
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
-
- src_addr += (int2)(sizeof(half), src1_stride_y);
-
- // Accumulate
- acc0 = fma(b0, (float8)a0, acc0); // b0 * (float8)a0;
-#if M0 > 1
- acc1 = fma(b0, (float8)a1, acc1); // b0 * (float8)a1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (float8)a2, acc2); // b0 * (float8)a2;
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (float8)a3, acc3); // b0 * (float8)a3;
-#endif // M0 > 3
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the row index by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
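- // Add beta*bias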
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias_f0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
- float8 bias_f0 = convert_float8(bias0);
-#if M0 > 1
- float8 bias_f1 = convert_float8(bias1);
-#endif // M0 > 1
-#if M0 > 2
- float8 bias_f2 = convert_float8(bias2);
-#endif // M0 > 2
-#if M0 > 3
- float8 bias_f3 = convert_float8(bias3);
-#endif // M0 > 3
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, float, bias_f, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias_f);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- half8 acc_h0 = convert_half8(acc0);
-#if M0 > 1
- half8 acc_h1 = convert_half8(acc1);
-#endif // M0 > 1
-#if M0 > 2
- half8 acc_h2 = convert_half8(acc2);
-#endif // M0 > 2
-#if M0 > 3
- half8 acc_h3 = convert_half8(acc3);
-#endif // M0 > 3
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
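
The fp32-accumulation idea used by the kernel above, shown in isolation (a sketch, not code from this file; K, a_ptr, b_ptr and b_stride_y stand in for the kernel's own address bookkeeping):

    float8 acc = 0.0f;
    for(int k = 0; k < K; ++k)
    {
        // Widen the half inputs before the fma so the running sum stays in fp32
        float8 b = convert_float8(vload8(0, (__global half *)(b_ptr + k * b_stride_y)));
        float  a = (float)(*((__global half *)(a_ptr + k * sizeof(half))));
        acc      = fma(b, (float8)a, acc);
    }
    half8 out = convert_half8(acc); // rounding to half happens only once, at the end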
-
-/** This OpenCL kernel computes the matrix-by-matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=8.
- * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
-#if defined(BETA)
- IMAGE_DECLARATION(src2),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
-#if defined(BETA)
- uint src2_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * N0;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(half);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated by dividing the row index by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- half8 acc0 = 0.0h;
-#if M0 > 1
- half8 acc1 = 0.0h;
-#endif // M0 > 1
-#if M0 > 2
- half8 acc2 = 0.0h;
-#endif // M0 > 2
-#if M0 > 3
- half8 acc3 = 0.0h;
-#endif // M0 > 3
-
- int i = 0;
- for(; i <= ((int)K - 4); i += 4)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
-
- // Accumulate
- acc0 = fma(b0, (half8)a0.s0, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s0, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s0, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s0, acc3);
-#endif // M0 > 3
-
- b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (half8)a0.s1, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s1, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s1, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s1, acc3);
-#endif // M0 > 3
-
- b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (half8)a0.s2, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s2, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s2, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s2, acc3);
-#endif // M0 > 3
-
- b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
- src_addr.s1 += src1_stride_y;
- acc0 = fma(b0, (half8)a0.s3, acc0);
-#if M0 > 1
- acc1 = fma(b0, (half8)a1.s3, acc1);
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2.s3, acc2);
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3.s3, acc3);
-#endif // M0 > 3
-
- src_addr.s0 += 4 * sizeof(half);
- }
-
- for(; i < (int)K; ++i)
- {
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // M0 > 3
-#else // defined(REINTERPRET_INPUT_AS_3D)
- // Load values from matrix A
- half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if M0 > 1
- half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // M0 > 1
-#if M0 > 2
- half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // M0 > 2
-#if M0 > 3
- half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // M0 > 3
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Load values from matrix B
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
-
- src_addr += (int2)(sizeof(half), src1_stride_y);
-
- // Accumulate
- acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
-#if M0 > 1
- acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
-#endif // M0 > 1
-#if M0 > 2
- acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
-#endif // M0 > 2
-#if M0 > 3
- acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
-#endif // M0 > 3
- }
-
- int z = get_global_id(2);
-
- // Compute dst address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- uint4 zout = 0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated by dividing the row index by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, half, acc, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
- LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
- PARTIAL_STORE_M0)
- * src2_stride_y)
- + z * src2_stride_z;
-
- LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, half, bias, BETA);
-#endif // UNIT_BETA
-
- // acc = acc + bias
- ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
- ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- // Store the output block
- const bool cond_y = get_global_id(1) == 0;
- const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
- STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
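
A worked example of the boundary handling above, assuming PARTIAL_STORE_N0 = N % 8: with N = 20, the work-item at get_global_id(0) == 2 covers columns 16..23, so cond_x is true ((2 + 1) * 8 >= 20) and STORE_BLOCK_BOUNDARY_AWARE stores only the 4 valid columns of the tile instead of the full 8.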
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-
-#endif // defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
deleted file mode 100644
index 3e3c9fd23c..0000000000
--- a/src/core/CL/cl_kernels/harris_corners.cl
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Function computing the Harris score on a 3x3 block size
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- * e.g. -DDATA_TYPE=short.
- *
- * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
- * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
- * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
- * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
- * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
- * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
- * @param[in] pow4_normalization_factor Normalization factor to apply to the Harris score
- */
-__kernel void harris_score_3x3(
- IMAGE_DECLARATION(src_gx),
- IMAGE_DECLARATION(src_gy),
- IMAGE_DECLARATION(vc),
- float sensitivity,
- float strength_thresh,
- float pow4_normalization_factor)
-{
- Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
- Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
- Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
-
- /* Gx^2, Gy^2 and Gx*Gy */
- float4 gx2 = (float4)0.0f;
- float4 gy2 = (float4)0.0f;
- float4 gxgy = (float4)0.0f;
-
- /* Row0 */
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
-
- float4 l_gx = convert_float4(temp_gx.s0123);
- float4 m_gx = convert_float4(temp_gx.s1234);
- float4 r_gx = convert_float4(temp_gx.s2345);
-
- float4 l_gy = convert_float4(temp_gy.s0123);
- float4 m_gy = convert_float4(temp_gy.s1234);
- float4 r_gy = convert_float4(temp_gy.s2345);
-
- gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
- gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
- gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
- /* Row1 */
- temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
- temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
-
- l_gx = convert_float4(temp_gx.s0123);
- m_gx = convert_float4(temp_gx.s1234);
- r_gx = convert_float4(temp_gx.s2345);
-
- l_gy = convert_float4(temp_gy.s0123);
- m_gy = convert_float4(temp_gy.s1234);
- r_gy = convert_float4(temp_gy.s2345);
-
- gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
- gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
- gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
- /* Row2 */
- temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
- temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
-
- l_gx = convert_float4(temp_gx.s0123);
- m_gx = convert_float4(temp_gx.s1234);
- r_gx = convert_float4(temp_gx.s2345);
-
- l_gy = convert_float4(temp_gy.s0123);
- m_gy = convert_float4(temp_gy.s1234);
- r_gy = convert_float4(temp_gy.s2345);
-
- gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
- gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
- gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
-
- /* Compute trace and determinant */
- float4 trace = gx2 + gy2;
- float4 det = gx2 * gy2 - (gxgy * gxgy);
-
- /* Compute harris score */
- float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
- mc = select(0.0f, mc, mc > (float4)strength_thresh);
-
- vstore4(mc, 0, (__global float *)vc.ptr);
-}
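
The per-pixel math above, reduced to a scalar sketch (an illustrative helper, not code from this file):

    float harris_response(float gx2, float gy2, float gxgy, float k, float norm)
    {
        float trace = gx2 + gy2;                 // Gx^2 + Gy^2
        float det   = gx2 * gy2 - gxgy * gxgy;   // determinant of the structure tensor
        return (det - k * trace * trace) * norm; // Harris-Stephens score
    }

For example, with gx2 = 9, gy2 = 4, gxgy = 2, k = 0.04 and norm = 1: trace = 13, det = 32, and the score is 32 - 0.04 * 169 = 25.24, which survives only if it exceeds strength_thresh.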
-
-/** Function for calculating the Harris score over a 1x5 row.
- *
- * @param[in] src_gx Pointer to gx gradient image.
- * @param[in] src_gy Pointer to gy gradient image.
- * @param[in] row Relative row.
- */
-inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
-{
- float4 gx2 = 0.0f;
- float4 gy2 = 0.0f;
- float4 gxgy = 0.0f;
-
- /* Row */
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
-
- float4 gx = convert_float4(temp_gx.s0123);
- float4 gy = convert_float4(temp_gy.s0123);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx.s1234);
- gy = convert_float4(temp_gy.s1234);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx.s2345);
- gy = convert_float4(temp_gy.s2345);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx.s3456);
- gy = convert_float4(temp_gy.s3456);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx.s4567);
- gy = convert_float4(temp_gy.s4567);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- return (float16)(gx2, gy2, gxgy, (float4)0);
-}
-
-/** Function computing the Harris score on a 5x5 block size
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- * e.g. -DDATA_TYPE=short.
- *
- * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
- * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
- * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
- * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
- * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
- * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
- * @param[in] pow4_normalization_factor Normalization factor to apply to the Harris score
- */
-__kernel void harris_score_5x5(
- IMAGE_DECLARATION(src_gx),
- IMAGE_DECLARATION(src_gy),
- IMAGE_DECLARATION(vc),
- float sensitivity,
- float strength_thresh,
- float pow4_normalization_factor)
-{
- Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
- Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
- Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
-
- /* Gx^2, Gy^2 and Gx*Gy */
- float16 res = (float16)0.0f;
-
- /* Compute row */
- for(int i = -2; i < 3; i++)
- {
- res += harris_score_1x5(&src_gx, &src_gy, i);
- }
-
- float4 gx2 = res.s0123;
- float4 gy2 = res.s4567;
- float4 gxgy = res.s89AB;
-
- /* Compute trace and determinant */
- float4 trace = gx2 + gy2;
- float4 det = gx2 * gy2 - (gxgy * gxgy);
-
- /* Compute harris score */
- float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
- mc = select(0.0f, mc, mc > (float4)strength_thresh);
-
- vstore4(mc, 0, (__global float *)vc.ptr);
-}
-
-/** Function for calculating the Harris score over a 1x7 row.
- *
- * @param[in] src_gx Pointer to gx gradient image.
- * @param[in] src_gy Pointer to gy gradient image.
- * @param[in] row Relative row.
- */
-inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
-{
- float4 gx2 = 0.0f;
- float4 gy2 = 0.0f;
- float4 gxgy = 0.0f;
-
- /* Row */
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
-
- float4 gx = convert_float4(temp_gx0.s0123);
- float4 gy = convert_float4(temp_gy0.s0123);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx0.s1234);
- gy = convert_float4(temp_gy0.s1234);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx0.s2345);
- gy = convert_float4(temp_gy0.s2345);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx0.s3456);
- gy = convert_float4(temp_gy0.s3456);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4(temp_gx0.s4567);
- gy = convert_float4(temp_gy0.s4567);
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0));
- gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
- gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
- gx2 += (gx * gx);
- gy2 += (gy * gy);
- gxgy += (gx * gy);
-
- return (float16)(gx2, gy2, gxgy, (float4)0);
-}
-
-/** Function computing the Harris score on a 7x7 block size
- *
- * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
- * e.g. -DDATA_TYPE=short.
- *
- * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
- * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
- * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
- * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
- * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
- * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
- * @param[in] pow4_normalization_factor Normalization factor to apply to the Harris score
- */
-__kernel void harris_score_7x7(
- IMAGE_DECLARATION(src_gx),
- IMAGE_DECLARATION(src_gy),
- IMAGE_DECLARATION(vc),
- float sensitivity,
- float strength_thresh,
- float pow4_normalization_factor)
-{
- Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
- Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
- Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
-
- /* Gx^2, Gy^2 and Gx*Gy */
- float16 res = (float16)0.0f;
-
- /* Compute row */
- for(int i = -3; i < 4; i++)
- {
- res += harris_score_1x7(&src_gx, &src_gy, i);
- }
-
- float4 gx2 = res.s0123;
- float4 gy2 = res.s4567;
- float4 gxgy = res.s89AB;
-
- /* Compute trace and determinant */
- float4 trace = gx2 + gy2;
- float4 det = gx2 * gy2 - (gxgy * gxgy);
-
- /* Compute harris score */
- float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
-
- mc = select(0.0f, mc, mc > (float4)strength_thresh);
-
- vstore4(mc, 0, (__global float *)vc.ptr);
-}
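
The 1x5/1x7 row helpers above pack their three float4 accumulators into one float16 laid out as (gx2, gy2, gxgy, 0), so the block kernels can accumulate every row with a single vector add and unpack afterwards via res.s0123, res.s4567 and res.s89AB. A minimal sketch of the same pattern (gx2_row, gy2_row and gxgy_row are hypothetical per-row reducers):

    float16 res = (float16)0.0f;
    for(int i = -3; i < 4; i++)
    {
        res += (float16)(gx2_row(i), gy2_row(i), gxgy_row(i), (float4)0);
    }
    float4 gx2 = res.s0123, gy2 = res.s4567, gxgy = res.s89AB;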
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 372ccd91fb..6e05a513ec 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_HELPER_H
-#define ARM_COMPUTE_HELPER_H
+#ifndef ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_H
+#define ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_H
#include "load_store_utility.h"
@@ -44,6 +44,7 @@
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
+#define GPU_ARCH_VALHALL 0x300
/** Concatenate two inputs.
*
@@ -80,11 +81,11 @@
* @return The reversed vector
* @{
*/
-#define REV1(x) ((x))
-#define REV2(x) ((x).s10)
-#define REV3(x) ((x).s210)
-#define REV4(x) ((x).s3210)
-#define REV8(x) ((x).s76543210)
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn
@@ -98,7 +99,7 @@
* @{
*/
#define REVERSE_STR(x, s) REV##s((x))
-#define REVERSE(x, s) REVERSE_STR(x, s)
+#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
@@ -110,18 +111,22 @@
* @{
*/
#define ROT1_0(x) ((x))
+#define ROT1_1(x) ((x))
#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
+#define ROT2_2(x) ((x))
#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
+#define ROT3_3(x) ((x))
#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
+#define ROT4_4(x) ((x))
#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
@@ -131,23 +136,25 @@
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
-
-#define ROT16_0(x) ((x))
-#define ROT16_1(x) ((x).sF0123456789ABCDE)
-#define ROT16_2(x) ((x).sEF0123456789ABCD)
-#define ROT16_3(x) ((x).sDEF0123456789ABC)
-#define ROT16_4(x) ((x).sCDEF0123456789AB)
-#define ROT16_5(x) ((x).sBCDEF0123456789A)
-#define ROT16_6(x) ((x).sABCDEF0123456789)
-#define ROT16_7(x) ((x).s9ABCDEF012345678)
-#define ROT16_8(x) ((x).s89ABCDEF01234567)
-#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT8_8(x) ((x))
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
+#define ROT16_16(x) ((x))
/** @} */ // end of group ROTs_n
/** Circular-right-shift (rotate-right) the given vector by the given amount.
@@ -161,7 +168,7 @@
* @{
*/
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
-#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
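/* Example expansion, read off the tables above (illustrative note): ROTATE(v, 4, 1)
 * resolves to ROT4_1(v), i.e. ((v).s3012), so an int4 (10, 20, 30, 40) rotated
 * right by one lane becomes (40, 10, 20, 30). */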
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
@@ -172,11 +179,11 @@
* @return The vector filled with offset values
* @{
*/
-#define V_OFFS1(dt) (dt##1)(0)
-#define V_OFFS2(dt) (dt##2)(0, 1)
-#define V_OFFS3(dt) (dt##3)(0, 1, 2)
-#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS1(dt) (dt##1)(0)
+#define V_OFFS2(dt) (dt##2)(0, 1)
+#define V_OFFS3(dt) (dt##3)(0, 1, 2)
+#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn
@@ -190,14 +197,216 @@
* @{
*/
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
-#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
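/* Example expansion (illustrative note): VEC_OFFS(int, 4) resolves to V_OFFS4(int),
 * i.e. (int4)(0, 1, 2, 3), handy for building per-lane coordinates:
 *   int4 x = (int4)(get_global_id(0) * 4) + VEC_OFFS(int, 4); */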
#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
+#define VLOAD(size) VLOAD_STR(size)
+
+/** Extended partial vload that correctly handles scalar values as well.
+ * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
+ * @name VLOAD_PARTIAL
+ *
+ * @note With this macro, the passed data can be both a vector and a scalar
+ * @note @p load_size needs to be <= @p size
+ * eg 1: Valid
+ * VLOAD_PARTIAL(16, 15) ...;
+ * eg 2: Invalid
+ * VLOAD_PARTIAL(4, 7) ...;
+ *
+ * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] load_size The number of lower elements to load. Supported values: 1-16, but has to be <= @p size
+ * @{
+ */
+#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
+#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
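
/* Usage sketch (assumes the vload_partial_n bodies defined elsewhere in this header):
 *   float4 data = 0;
 *   VLOAD_PARTIAL(4, 3)
 *   (data, 0, (__global float *)src);
 * dispatches to vload_partial_4_3, i.e. vload_partial_3, loading only the three
 * valid lower elements of data without reading past the end of the buffer. */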
-#define PIXEL_UNIT4 1
-#define PIXEL_UNIT8 2
+#define NO_LOAD(data, offs, ptr) \
+ { \
+ }
+
+// Size == 1 (scalar)
+#define vload_partial_1_0 NO_LOAD
+#define vload_partial_1_1 vload1
+#define vload_partial_1_2 NO_LOAD
+#define vload_partial_1_3 NO_LOAD
+#define vload_partial_1_4 NO_LOAD
+#define vload_partial_1_5 NO_LOAD
+#define vload_partial_1_6 NO_LOAD
+#define vload_partial_1_7 NO_LOAD
+#define vload_partial_1_8 NO_LOAD
+#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_10 NO_LOAD
+#define vload_partial_1_11 NO_LOAD
+#define vload_partial_1_12 NO_LOAD
+#define vload_partial_1_13 NO_LOAD
+#define vload_partial_1_14 NO_LOAD
+#define vload_partial_1_15 NO_LOAD
+#define vload_partial_1_16 NO_LOAD
+// Size == 2
+#define vload_partial_2_0 NO_LOAD
+#define vload_partial_2_1 vload_partial_1
+#define vload_partial_2_2 vload_partial_2
+#define vload_partial_2_3 NO_LOAD
+#define vload_partial_2_4 NO_LOAD
+#define vload_partial_2_5 NO_LOAD
+#define vload_partial_2_6 NO_LOAD
+#define vload_partial_2_7 NO_LOAD
+#define vload_partial_2_8 NO_LOAD
+#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_10 NO_LOAD
+#define vload_partial_2_11 NO_LOAD
+#define vload_partial_2_12 NO_LOAD
+#define vload_partial_2_13 NO_LOAD
+#define vload_partial_2_14 NO_LOAD
+#define vload_partial_2_15 NO_LOAD
+#define vload_partial_2_16 NO_LOAD
+// Size == 3
+#define vload_partial_3_0 NO_LOAD
+#define vload_partial_3_1 vload_partial_1
+#define vload_partial_3_2 vload_partial_2
+#define vload_partial_3_3 vload_partial_3
+#define vload_partial_3_4 NO_LOAD
+#define vload_partial_3_5 NO_LOAD
+#define vload_partial_3_6 NO_LOAD
+#define vload_partial_3_7 NO_LOAD
+#define vload_partial_3_8 NO_LOAD
+#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_10 NO_LOAD
+#define vload_partial_3_11 NO_LOAD
+#define vload_partial_3_12 NO_LOAD
+#define vload_partial_3_13 NO_LOAD
+#define vload_partial_3_14 NO_LOAD
+#define vload_partial_3_15 NO_LOAD
+#define vload_partial_3_16 NO_LOAD
+// Size == 4
+#define vload_partial_4_0 NO_LOAD
+#define vload_partial_4_1 vload_partial_1
+#define vload_partial_4_2 vload_partial_2
+#define vload_partial_4_3 vload_partial_3
+#define vload_partial_4_4 vload_partial_4
+#define vload_partial_4_5 NO_LOAD
+#define vload_partial_4_6 NO_LOAD
+#define vload_partial_4_7 NO_LOAD
+#define vload_partial_4_8 NO_LOAD
+#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_10 NO_LOAD
+#define vload_partial_4_11 NO_LOAD
+#define vload_partial_4_12 NO_LOAD
+#define vload_partial_4_13 NO_LOAD
+#define vload_partial_4_14 NO_LOAD
+#define vload_partial_4_15 NO_LOAD
+#define vload_partial_4_16 NO_LOAD
+// Size == 8
+#define vload_partial_8_0 NO_LOAD
+#define vload_partial_8_1 vload_partial_1
+#define vload_partial_8_2 vload_partial_2
+#define vload_partial_8_3 vload_partial_3
+#define vload_partial_8_4 vload_partial_4
+#define vload_partial_8_5 vload_partial_5
+#define vload_partial_8_6 vload_partial_6
+#define vload_partial_8_7 vload_partial_7
+#define vload_partial_8_8 vload_partial_8
+#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_10 NO_LOAD
+#define vload_partial_8_11 NO_LOAD
+#define vload_partial_8_12 NO_LOAD
+#define vload_partial_8_13 NO_LOAD
+#define vload_partial_8_14 NO_LOAD
+#define vload_partial_8_15 NO_LOAD
+#define vload_partial_8_16 NO_LOAD
+// Size == 16
+#define vload_partial_16_0 NO_LOAD
+#define vload_partial_16_1 vload_partial_1
+#define vload_partial_16_2 vload_partial_2
+#define vload_partial_16_3 vload_partial_3
+#define vload_partial_16_4 vload_partial_4
+#define vload_partial_16_5 vload_partial_5
+#define vload_partial_16_6 vload_partial_6
+#define vload_partial_16_7 vload_partial_7
+#define vload_partial_16_8 vload_partial_8
+#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_10 vload_partial_10
+#define vload_partial_16_11 vload_partial_11
+#define vload_partial_16_12 vload_partial_12
+#define vload_partial_16_13 vload_partial_13
+#define vload_partial_16_14 vload_partial_14
+#define vload_partial_16_15 vload_partial_15
+#define vload_partial_16_16 vload_partial_16
+
+/** Partial vload. Loads the **lower** n elements (elements 0 to n - 1) of the given vector while minimising the number of vload ops
+ * @name vload_partial_n
+ *
+ * @note @p DATA needs to be a vector, not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * e.g. 1: Valid
+ * vload_partial_15(var:float16, 0, 0xabcd);
+ * e.g. 2: Invalid
+ * vload_partial_7(var:float4, 0, 0xabcd);
+ *
+ * @note In the cases n == 1, 2, 3, 4, 8, 16, a single vload is invoked, so there is no performance penalty.
+ *
+ * @param[in] DATA   The name of the variable into which to load the values
+ * @param[in] OFFSET Offset passed on to the underlying vload ops
+ * @param[in] PTR    The base pointer
+ * @{
+ */
+#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);
+
+#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);
+
+#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);
+
+#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);
+
+#define vload_partial_5(DATA, OFFSET, PTR) \
+ vload_partial_4(DATA.s0123, OFFSET, PTR); \
+ DATA.s4 = vload1(OFFSET, PTR + 4);
+
+#define vload_partial_6(DATA, OFFSET, PTR) \
+ vload_partial_4(DATA.s0123, OFFSET, PTR); \
+ vload_partial_2(DATA.s45, OFFSET, PTR + 4);
+
+#define vload_partial_7(DATA, OFFSET, PTR) \
+ vload_partial_4(DATA.s0123, OFFSET, PTR); \
+ vload_partial_3(DATA.s456, OFFSET, PTR + 4);
+
+#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);
+
+#define vload_partial_9(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ DATA.s8 = vload1(OFFSET, PTR + 8);
+
+#define vload_partial_10(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_2(DATA.s89, OFFSET, PTR + 8);
+
+#define vload_partial_11(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
+
+#define vload_partial_12(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
+// For vload_partial_{13,14,15}, an 8-element swizzle is passed to the second load, because vector sizes of 5, 6 and 7 are not supported
+#define vload_partial_13(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
+
+#define vload_partial_14(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
+
+#define vload_partial_15(DATA, OFFSET, PTR) \
+ vload_partial_8(DATA.s01234567, OFFSET, PTR); \
+ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
+
+#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
+/** @} */ // end of group vload_partial_n
+/** @} */ // end of group VLOAD_PARTIAL
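A minimal usage sketch (editorial; the pointer and function names are hypothetical). Note that the compound variants such as vload_partial_5 advance PTR directly for their second sub-load, so in practice the whole family is invoked with OFFSET == 0:

void example_partial_load(__global float *src)
{
    float4 data = (float4)(0.0f);
    VLOAD_PARTIAL(4, 3) // expands to vload_partial_4_3, i.e. vload_partial_3
    (data, 0, src);     // fills data.s012 only; data.s3 keeps its previous value
}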
+
+#define PIXEL_UNIT4 1
+#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4
/** Utility macro to convert a vector size into pixel units.
@@ -210,17 +419,45 @@
* @{
*/
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
-#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
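For context (editorial): the image reads below return one 4-element pixel per access, which is where these constants come from:

// CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(16) expands to PIXEL_UNIT16 == 4:
// a 16-element vector is covered by 4 single-pixel image accesses.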
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
-#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_floatx2(img, x_coord, y_coord) \
+ (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_floatx4(img, x_coord, y_coord) \
+ (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \
+ read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
-#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_halfx2(img, x_coord, y_coord) \
+ (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_halfx4(img, x_coord, y_coord) \
+ (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \
+ read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_floatx2(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_halfx2(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
/** Utility macro to read a 2D OpenCL image object.
@@ -237,24 +474,44 @@
* @{
*/
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
-#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+/** @} */
+
+/** Utility macro to write a 2D OpenCL image object.
+ *
+ * @note Coordinates are not normalized
+ *
+ * @param[in] data_type Data type
+ * @param[in] n0        Number of pixels to write. Only 1, 2 and 4 are supported
+ * @param[in] img OpenCL image object
+ * @param[in] x_coord The x coordinate for the top-left pixel
+ * @param[in] y_coord The y coordinate for the top-left pixel
+ * @param[in] values Values to write
+ *
+ * @{
+ */
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
+ write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
+ WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+/** @} */
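A minimal round-trip sketch (editorial; names are hypothetical), assuming suitably created image objects and non-normalised integer coordinates:

void example_image_copy(read_only image2d_t src, write_only image2d_t dst, int x, int y)
{
    float8 px = READ_IMAGE2D(float, 2, src, x, y); // reads pixels (x, y) and (x + 1, y)
    WRITE_IMAGE2D(float, 2, dst, x, y, px);        // writes the same two pixels back
}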
#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
+#define VSTORE(size) VSTORE_STR(size)
-#define float1 float
-#define half1 half
-#define char1 char
-#define uchar1 uchar
-#define short1 short
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
#define ushort1 ushort
-#define int1 int
-#define uint1 uint
-#define long1 long
-#define ulong1 ulong
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
#define double1 double
-#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
/** Extended partial vstore that correctly handles scalar values as well.
@@ -273,23 +530,23 @@
* @{
*/
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
-#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
#define NO_STORE(data, offs, ptr) \
{ \
}
// Size == 1 (scalar)
-#define vstore_partial_1_0 NO_STORE
-#define vstore_partial_1_1 vstore1
-#define vstore_partial_1_2 NO_STORE
-#define vstore_partial_1_3 NO_STORE
-#define vstore_partial_1_4 NO_STORE
-#define vstore_partial_1_5 NO_STORE
-#define vstore_partial_1_6 NO_STORE
-#define vstore_partial_1_7 NO_STORE
-#define vstore_partial_1_8 NO_STORE
-#define vstore_partial_1_9 NO_STORE
+#define vstore_partial_1_0 NO_STORE
+#define vstore_partial_1_1 vstore1
+#define vstore_partial_1_2 NO_STORE
+#define vstore_partial_1_3 NO_STORE
+#define vstore_partial_1_4 NO_STORE
+#define vstore_partial_1_5 NO_STORE
+#define vstore_partial_1_6 NO_STORE
+#define vstore_partial_1_7 NO_STORE
+#define vstore_partial_1_8 NO_STORE
+#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
@@ -298,16 +555,16 @@
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
-#define vstore_partial_2_0 NO_STORE
-#define vstore_partial_2_1 vstore_partial_1
-#define vstore_partial_2_2 vstore_partial_2
-#define vstore_partial_2_3 NO_STORE
-#define vstore_partial_2_4 NO_STORE
-#define vstore_partial_2_5 NO_STORE
-#define vstore_partial_2_6 NO_STORE
-#define vstore_partial_2_7 NO_STORE
-#define vstore_partial_2_8 NO_STORE
-#define vstore_partial_2_9 NO_STORE
+#define vstore_partial_2_0 NO_STORE
+#define vstore_partial_2_1 vstore_partial_1
+#define vstore_partial_2_2 vstore_partial_2
+#define vstore_partial_2_3 NO_STORE
+#define vstore_partial_2_4 NO_STORE
+#define vstore_partial_2_5 NO_STORE
+#define vstore_partial_2_6 NO_STORE
+#define vstore_partial_2_7 NO_STORE
+#define vstore_partial_2_8 NO_STORE
+#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
@@ -316,16 +573,16 @@
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
-#define vstore_partial_3_0 NO_STORE
-#define vstore_partial_3_1 vstore_partial_1
-#define vstore_partial_3_2 vstore_partial_2
-#define vstore_partial_3_3 vstore_partial_3
-#define vstore_partial_3_4 NO_STORE
-#define vstore_partial_3_5 NO_STORE
-#define vstore_partial_3_6 NO_STORE
-#define vstore_partial_3_7 NO_STORE
-#define vstore_partial_3_8 NO_STORE
-#define vstore_partial_3_9 NO_STORE
+#define vstore_partial_3_0 NO_STORE
+#define vstore_partial_3_1 vstore_partial_1
+#define vstore_partial_3_2 vstore_partial_2
+#define vstore_partial_3_3 vstore_partial_3
+#define vstore_partial_3_4 NO_STORE
+#define vstore_partial_3_5 NO_STORE
+#define vstore_partial_3_6 NO_STORE
+#define vstore_partial_3_7 NO_STORE
+#define vstore_partial_3_8 NO_STORE
+#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
@@ -334,16 +591,16 @@
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
-#define vstore_partial_4_0 NO_STORE
-#define vstore_partial_4_1 vstore_partial_1
-#define vstore_partial_4_2 vstore_partial_2
-#define vstore_partial_4_3 vstore_partial_3
-#define vstore_partial_4_4 vstore_partial_4
-#define vstore_partial_4_5 NO_STORE
-#define vstore_partial_4_6 NO_STORE
-#define vstore_partial_4_7 NO_STORE
-#define vstore_partial_4_8 NO_STORE
-#define vstore_partial_4_9 NO_STORE
+#define vstore_partial_4_0 NO_STORE
+#define vstore_partial_4_1 vstore_partial_1
+#define vstore_partial_4_2 vstore_partial_2
+#define vstore_partial_4_3 vstore_partial_3
+#define vstore_partial_4_4 vstore_partial_4
+#define vstore_partial_4_5 NO_STORE
+#define vstore_partial_4_6 NO_STORE
+#define vstore_partial_4_7 NO_STORE
+#define vstore_partial_4_8 NO_STORE
+#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
@@ -352,16 +609,16 @@
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
-#define vstore_partial_8_0 NO_STORE
-#define vstore_partial_8_1 vstore_partial_1
-#define vstore_partial_8_2 vstore_partial_2
-#define vstore_partial_8_3 vstore_partial_3
-#define vstore_partial_8_4 vstore_partial_4
-#define vstore_partial_8_5 vstore_partial_5
-#define vstore_partial_8_6 vstore_partial_6
-#define vstore_partial_8_7 vstore_partial_7
-#define vstore_partial_8_8 vstore_partial_8
-#define vstore_partial_8_9 NO_STORE
+#define vstore_partial_8_0 NO_STORE
+#define vstore_partial_8_1 vstore_partial_1
+#define vstore_partial_8_2 vstore_partial_2
+#define vstore_partial_8_3 vstore_partial_3
+#define vstore_partial_8_4 vstore_partial_4
+#define vstore_partial_8_5 vstore_partial_5
+#define vstore_partial_8_6 vstore_partial_6
+#define vstore_partial_8_7 vstore_partial_7
+#define vstore_partial_8_8 vstore_partial_8
+#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
@@ -370,16 +627,16 @@
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
-#define vstore_partial_16_0 NO_STORE
-#define vstore_partial_16_1 vstore_partial_1
-#define vstore_partial_16_2 vstore_partial_2
-#define vstore_partial_16_3 vstore_partial_3
-#define vstore_partial_16_4 vstore_partial_4
-#define vstore_partial_16_5 vstore_partial_5
-#define vstore_partial_16_6 vstore_partial_6
-#define vstore_partial_16_7 vstore_partial_7
-#define vstore_partial_16_8 vstore_partial_8
-#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_0 NO_STORE
+#define vstore_partial_16_1 vstore_partial_1
+#define vstore_partial_16_2 vstore_partial_2
+#define vstore_partial_16_3 vstore_partial_3
+#define vstore_partial_16_4 vstore_partial_4
+#define vstore_partial_16_5 vstore_partial_5
+#define vstore_partial_16_6 vstore_partial_6
+#define vstore_partial_16_7 vstore_partial_7
+#define vstore_partial_16_8 vstore_partial_8
+#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
@@ -405,17 +662,13 @@
* @param[in] PTR The base pointer
* @{
*/
-#define vstore_partial_1(DATA, OFFSET, PTR) \
- vstore1(DATA.s0, OFFSET, PTR);
+#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);
-#define vstore_partial_2(DATA, OFFSET, PTR) \
- vstore2(DATA.s01, OFFSET, PTR);
+#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);
-#define vstore_partial_3(DATA, OFFSET, PTR) \
- vstore3(DATA.s012, OFFSET, PTR);
+#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);
-#define vstore_partial_4(DATA, OFFSET, PTR) \
- vstore4(DATA.s0123, OFFSET, PTR);
+#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);
#define vstore_partial_5(DATA, OFFSET, PTR) \
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -429,8 +682,7 @@
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
-#define vstore_partial_8(DATA, OFFSET, PTR) \
- vstore8(DATA.s01234567, OFFSET, PTR);
+#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);
#define vstore_partial_9(DATA, OFFSET, PTR) \
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -460,132 +712,156 @@
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
-#define vstore_partial_16(DATA, OFFSET, PTR) \
- vstore16(DATA, OFFSET, PTR);
+#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
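A sketch of the canonical leftover-handling pattern these macros exist for (editorial; the function name and flag are hypothetical):

void example_partial_store(__global float *dst, int is_leftover_block)
{
    float4 res = (float4)(1.0f, 2.0f, 3.0f, 4.0f); // values computed earlier in the kernel
    if (is_leftover_block)
    {
        VSTORE_PARTIAL(4, 3)(res, 0, dst); // expands to vstore_partial_3: stores res.s012 only
    }
    else
    {
        VSTORE(4)(res, 0, dst); // full vstore4 on interior blocks
    }
}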
// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
// without _sat to overcome this issue
-#define convert_float_sat convert_float
-#define convert_float1_sat convert_float
-#define convert_float2_sat convert_float2
-#define convert_float3_sat convert_float3
-#define convert_float4_sat convert_float4
-#define convert_float8_sat convert_float8
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
-#define convert_half_sat convert_float
-#define convert_half1_sat convert_half
-#define convert_half2_sat convert_half2
-#define convert_half3_sat convert_half3
-#define convert_half4_sat convert_half4
-#define convert_half8_sat convert_half8
-#define convert_half16_sat convert_half16
-
-#define convert_float1 convert_float
-#define convert_half1 convert_half
-#define convert_char1 convert_char
-#define convert_uchar1 convert_uchar
-#define convert_short1 convert_short
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
-#define convert_int1 convert_int
-#define convert_uint1 convert_uint
-#define convert_long1 convert_long
-#define convert_ulong1 convert_ulong
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
#define convert_double1 convert_double
-#define convert_char1_sat convert_char_sat
-#define convert_uchar1_sat convert_uchar_sat
-#define convert_short1_sat convert_short_sat
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_uchar2_sat convert_uchar2_sat
+#define convert_uchar3_sat convert_uchar3_sat
+#define convert_uchar4_sat convert_uchar4_sat
+#define convert_uchar8_sat convert_uchar8_sat
+#define convert_uchar16_sat convert_uchar16_sat
+#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
-#define convert_int1_sat convert_int_sat
-#define convert_uint1_sat convert_uint_sat
-#define convert_long1_sat convert_long_sat
-#define convert_ulong1_sat convert_ulong_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
+#define CONVERT(x, type) CONVERT_STR(x, type)
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
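A worked sketch of the saturating-conversion path (editorial; hypothetical function name):

uchar4 example_convert_sat(void)
{
    int4 v = (int4)(-7, 0, 200, 999);
    return CONVERT_SAT(v, uchar4); // expands to convert_uchar4_sat(v) == (0, 0, 200, 255)
}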
-#define select_vec_dt_uchar(size) uchar##size
-#define select_vec_dt_char(size) char##size
+#define select_vec_dt_uchar(size) uchar##size
+#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
-#define select_vec_dt_short(size) short##size
-#define select_vec_dt_half(size) short##size
-#define select_vec_dt_uint(size) uint##size
-#define select_vec_dt_int(size) int##size
-#define select_vec_dt_float(size) int##size
-#define select_vec_dt_ulong(size) ulong##size
-#define select_vec_dt_long(size) long##size
+#define select_vec_dt_short(size) short##size
+#define select_vec_dt_half(size) short##size
+#define select_vec_dt_uint(size) uint##size
+#define select_vec_dt_int(size) int##size
+#define select_vec_dt_float(size) int##size
+#define select_vec_dt_ulong(size) ulong##size
+#define select_vec_dt_long(size) long##size
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
-#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
-#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
-
-#define sum_reduce_1(x) (x)
-#define sum_reduce_2(x) ((x).s0) + ((x).s1)
-#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
-#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
-#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
+#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
+#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
+
+#define signed_int_vec_dt_uchar(size) char##size
+#define signed_int_vec_dt_char(size) char##size
+#define signed_int_vec_dt_ushort(size) short##size
+#define signed_int_vec_dt_short(size) short##size
+#define signed_int_vec_dt_half(size) short##size
+#define signed_int_vec_dt_uint(size) int##size
+#define signed_int_vec_dt_int(size) int##size
+#define signed_int_vec_dt_float(size) int##size
+#define signed_int_vec_dt_ulong(size) long##size
+#define signed_int_vec_dt_long(size) long##size
+
+#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
+#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
+#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
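Both the SELECT_* and SIGNED_INT_* families map every data type to an integer type of the same element width. This matters because OpenCL relational operators on an n-wide vector produce an n-wide integer mask of that width, which is exactly what select() expects. A minimal sketch (editorial):

float4 example_select(float4 a, float4 b)
{
    // SELECT_VEC_DATA_TYPE(float, 4) expands to int4, the mask type produced by (a < b).
    return select(a, b, (SELECT_VEC_DATA_TYPE(float, 4))(a < b)); // picks b wherever a < b
}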
+
+#define sum_reduce_1(x) (x)
+#define sum_reduce_2(x) ((x).s0) + ((x).s1)
+#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
+#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
+#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
-#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
-
-#define max_reduce_1(x) (x)
-#define max_reduce_2(x) max(((x).s0), ((x).s1))
-#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
-#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
-#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
+#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
+
+#define prod_reduce_1(x) (x)
+#define prod_reduce_2(x) ((x).s0) * ((x).s1)
+#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
+#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
+#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
+#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
+
+#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
+#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
+
+#define max_reduce_1(x) (x)
+#define max_reduce_2(x) max(((x).s0), ((x).s1))
+#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
+#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
+#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
-#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
-
-#define VECTOR_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_offset_first_element_in_bytes
-
-#define IMAGE_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR3D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR4D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_stride_w, \
- uint name##_step_w, \
- uint name##_offset_first_element_in_bytes
+#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
+
+#define min_reduce_1(x) (x)
+#define min_reduce_2(x) min(((x).s0), ((x).s1))
+#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
+#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
+#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
+#define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF))
+
+#define MIN_REDUCE_STR(x, size) min_reduce_##size(x)
+#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
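A worked sketch of the pairwise reductions (editorial; hypothetical function name):

float example_reductions(void)
{
    float4 v = (float4)(3.0f, 1.0f, 4.0f, 2.0f);
    float  s = SUM_REDUCE(v, 4);  // ((3 + 1) + (4 + 2)) == 10.0f
    float  p = PROD_REDUCE(v, 4); // ((3 * 1) * (4 * 2)) == 24.0f
    float  m = MAX_REDUCE(v, 4);  // max(max(3, 1), max(4, 2)) == 4.0f
    return s + p + m + MIN_REDUCE(v, 4); // MIN_REDUCE(v, 4) == 1.0f
}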
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR5D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \
+ uint name##_step_v, uint name##_offset_first_element_in_bytes
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -593,38 +869,47 @@
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-#define CONVERT_TO_IMAGE_STRUCT(name) \
- update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y)
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, 0)
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \
+ name##_step_w, mod_size)
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name) \
+ update_tensor4D_workitem_no_step_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_stride_y, name##_stride_z, name##_stride_w)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
- tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
+ tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
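A minimal kernel sketch (editorial; the kernel name is hypothetical) showing how the declaration and conversion macros pair up: IMAGE_DECLARATION(src) expands to the six src_* kernel arguments, and CONVERT_TO_IMAGE_STRUCT(src) builds an Image whose ptr already points at this work item's element:

__kernel void example_copy(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src); // ptr += offset + gid(0) * step_x + gid(1) * step_y
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
    *dst.ptr  = *src.ptr;                     // one uchar per work item
}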
/** Structure to hold Vector information */
typedef struct Vector
@@ -673,10 +958,10 @@ typedef struct Tensor4D
*
* @return An image object
*/
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector
+update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
- Vector vector =
- {
+ Vector vector = {
.ptr = ptr,
.offset_first_element_in_bytes = offset_first_element_in_bytes,
.stride_x = stride_x,
@@ -696,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
*
* @return An image object
*/
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(
+ __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
return img;
}
@@ -722,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
*
* @return A 3D tensor object
*/
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
- img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return img;
}
@@ -748,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o
*
* @return A 3D tensor object
*/
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return tensor;
}
@@ -775,34 +1068,58 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi
*
* @return A 3D tensor object
*/
-inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
return tensor;
}
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
- uint step_w,
- uint mod_size)
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z,
+ uint stride_w,
+ uint step_w,
+ uint mod_size)
{
- Tensor4D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w
- };
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+inline Tensor4D update_tensor4D_workitem_no_step_ptr(
+ __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint stride_y, uint stride_z, uint stride_w)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes;
return tensor;
}
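An editorial worked note on the mod_size packing used by update_tensor4D_workitem_ptr above: the third dispatch dimension carries both z and the batch index w.

// With mod_size == 5 (the tensor depth) and get_global_id(2) == 12:
//   z == 12 % 5 == 2 and w == 12 / 5 == 2,
// so tensor.ptr advances by 2 * step_z + 2 * step_w on top of the x/y terms.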
@@ -874,7 +1191,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid
const uint x = index;
- return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ tensor->offset_first_element_in_bytes;
}
-#endif // _HELPER_H
+#endif // ACL_SRC_CORE_CL_CL_KERNELS_HELPERS_H
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 59c8fa606d..166260a3c0 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@
* @return The converted vector
*/
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
/** Quantize a floating-point scalar value to an 8-bit asymmetric representation
*
@@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return quantized values
*/
-#define QUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
- { \
- VEC_DATA_TYPE(float, size) \
- out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
- VEC_DATA_TYPE(type, size) \
- res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
- return res; \
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+ return res; \
}
/** Dequantize a vector of values to floating-point
@@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return dequantized values in floating point
*/
-#define DEQUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
- { \
- return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
}
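A hypothetical worked round trip (editorial), assuming the uchar/size-4 instantiations that this header provides further down:

float4 example_quant_roundtrip(void)
{
    // quantize:   3.0f / 0.1f + 128 == 158, rounded to nearest and saturated to uchar
    uchar4 q = quantize_uchar4((float4)(3.0f), 128.0f, 0.1f); // (158, 158, 158, 158)
    // dequantize: (158 - 128) * 0.1f == 3.0f
    return dequantize_uchar4(q, 128.0f, 0.1f);
}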
/** Correctly-rounded-to-nearest division by a power-of-two.
@@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
- { \
- const VEC_DATA_TYPE(int, size) \
- zero = (VEC_DATA_TYPE(int, size))0; \
- const VEC_DATA_TYPE(int, size) \
- one = (VEC_DATA_TYPE(int, size))1; \
- VEC_DATA_TYPE(int, size) \
- mask = (one << exponent) - one; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
- return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
+ return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
}
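A worked sketch of the rounding behaviour (editorial; assumes the size-4 instantiation provided later in the header):

int4 example_rounding_divide(void)
{
    // 5/2 -> 3, -5/2 -> -3, 7/2 -> 4, -7/2 -> -4: nearest, with ties away from zero
    return asymm_rounding_divide_by_POW2_4((int4)(5, -5, 7, -7), (int4)(1));
}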
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -167,41 +168,44 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
}
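An editorial derivation note: for a in (-1/4, 0] the macro recentres on x = a + 1/8 and evaluates a degree-4 Taylor polynomial in Q0.31 fixed point:

// exp(a) == exp(-1/8) * exp(x),  with x = a + 1/8 in (-1/8, 1/8]
// exp(x) ~= 1 + x + x^2/2 + x^3/6 + x^4/24
// constant_term     == round(exp(-1/8) * 2^31) == 1895147668
// constant_1_over_3 == round(2^31 / 3)         == 715827883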
/** Each bit of the result is set to the corresponding bit of either then_val or
* else_val depending on whether the corresponding bit of if_mask is set.
- * Equivalent to the VBSL instruction in ARM NEON.
+ * Equivalent to the VBSL instruction in Arm® Neon™.
*
* @param[in] size Size of vector.
*
* @returns Result containing bits from @p then_val or from @p else_val depending on whether the corresponding bit in @p if_mask is set.
*/
-#define ASYMM_SELECT_USING_MASK_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
- { \
- return (if_mask & then_val) ^ (~if_mask & else_val); \
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \
+ VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
}
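A minimal sketch (editorial; assumes the size-4 instantiation provided later in the header). Masks come from the ASYMM_MASK_IF_* helpers and are all-ones or all-zeros per lane:

int4 example_select_using_mask(void)
{
    int4 if_mask = (int4)(-1, 0, -1, 0); // -1 has all bits set
    return asymm_select_using_mask4(if_mask, (int4)(10), (int4)(20)); // (10, 20, 10, 20)
}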
/** For each element of input vector, the corresponding bits of the result item are set
@@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
}
-#define EXP_BARREL_SHIFTER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
- { \
- if(k_integer_bits > exponent) \
- { \
- const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
- return ASYMM_SELECT_USING_MASK( \
- ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
- ASYMM_MULT(result, fp_multiplier, size), result, size); \
- } \
- \
- return result; \
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
}
/** Calculates \f$ exp(x) \f$ for x < 0.
@@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
- { \
- const int k_fractional_bits = 31 - k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- k_one_quarter = 1 << (k_fractional_bits - 2); \
- VEC_DATA_TYPE(int, size) \
- mask = k_one_quarter - 1; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
- VEC_DATA_TYPE(int, size) \
- remainder = a_mod_quarter_minus_one_quarter - a; \
- \
- result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
- \
- if(k_integer_bits > 5) \
- { \
- const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
- result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
- return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \
+ size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
}
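An editorial note on the barrel-shifter constants above: each EXP_BARREL_SHIFTER step conditionally multiplies by exp(-2^exponent), encoded in Q0.31:

// 1672461947 == round(exp(-1/4) * 2^31)    290630308 == round(exp(-2)  * 2^31)
// 1302514674 == round(exp(-1/2) * 2^31)     39332535 == round(exp(-4)  * 2^31)
//  790015084 == round(exp(-1)   * 2^31)       720401 == round(exp(-8)  * 2^31)
//                                                 242 == round(exp(-16) * 2^31)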
/** Calculates the product of an integer value by a power of two, with either a positive exponent
@@ -297,49 +303,51 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Arithmetic left or right shift.
*/
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if(exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
}
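
A plain-C model of the same behaviour (the function names are illustrative): a negative exponent becomes a rounding right shift, a positive one a left shift that saturates to INT32_MIN/INT32_MAX instead of wrapping, guarded by the (1 << (31 - exponent)) - 1 threshold used above.

#include <stdint.h>
#include <stdio.h>

static int32_t rounding_rshift(int32_t x, int exponent)
{
    int32_t mask      = (int32_t)((1u << exponent) - 1);
    int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (((x & mask) > threshold) ? 1 : 0);
}

static int32_t sat_rounding_mult_by_pow2(int32_t x, int exponent)
{
    if (exponent < 0)
        return rounding_rshift(x, -exponent);          /* rounding divide  */

    int32_t threshold = (int32_t)((1u << (31 - exponent)) - 1);
    if (x > threshold)
        return INT32_MAX;                              /* saturate upward   */
    if (x < -threshold)
        return INT32_MIN;                              /* saturate downward */
    return (int32_t)((uint32_t)x << exponent);         /* safe: |x| fits    */
}

int main(void)
{
    printf("%d\n", (int)sat_rounding_mult_by_pow2(3, -1));      /* 2: 1.5 rounds away from 0 */
    printf("%d\n", (int)sat_rounding_mult_by_pow2(1 << 29, 3)); /* 2147483647: saturated     */
    return 0;
}
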
/** Calculates (a+b)/2, rounded to the nearest integer.
- * Equivalent to VRHADD in the ARM NEON instruction set.
+ * Equivalent to VRHADD in the Arm® Neon™ instruction set.
*
* @param[in] size Size of vector.
*
* @return (a+b)/2, rounded to the nearest integer.
*/
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(long, size) \
- a64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- sum = a64 + b64; \
- const VEC_DATA_TYPE(long, size) one = 1; \
- const VEC_DATA_TYPE(long, size) minus_one = -1; \
- VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
- return convert_int##size((sum + sign) / 2); \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
+ return convert_int##size((sum + sign) / 2); \
}
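
The 64-bit widening is the whole trick: a + b can overflow 32 bits, and the sign-aware nudge makes the halving round ties away from zero. A scalar C model of the macro:

#include <stdint.h>
#include <stdio.h>

static int32_t rounding_half_sum(int32_t a, int32_t b)
{
    int64_t sum  = (int64_t)a + (int64_t)b;  /* widen so a + b cannot overflow */
    int64_t sign = (sum >= 0) ? 1 : -1;
    return (int32_t)((sum + sign) / 2);      /* ties round away from zero     */
}

int main(void)
{
    printf("%d\n", (int)rounding_half_sum(3, 4));                 /* 4:  3.5 -> 4   */
    printf("%d\n", (int)rounding_half_sum(-3, -4));               /* -4: -3.5 -> -4 */
    printf("%d\n", (int)rounding_half_sum(INT32_MAX, INT32_MAX)); /* no overflow    */
    return 0;
}
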
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
@@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
VEC_DATA_TYPE(int, size) \
- half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
VEC_DATA_TYPE(int, size) \
x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
- for(int i = 0; i < 3; i++) \
+ for (int i = 0; i < 3; i++) \
{ \
VEC_DATA_TYPE(int, size) \
half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
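
The visible part of this hunk is Newton-Raphson for a reciprocal: with half_denominator d = (1+x)/2 in [1/2, 1), each pass refines y <- y*(2 - d*y), and the Q2 seeds 1515870810 and -1010580540 are 48/17 and -32/17 in Q2.29, the classic minimax starting line for this range. A float model of the routine (an illustration, not the fixed-point code):

#include <stdio.h>

static float one_over_one_plus_x(float x)     /* valid for x in (0, 1) */
{
    float d = 0.5f * (1.0f + x);              /* half_denominator       */
    float y = 48.0f / 17.0f - 32.0f / 17.0f * d;
    for (int i = 0; i < 3; ++i)               /* same trip count as above */
        y = y * (2.0f - d * y);               /* Newton-Raphson step     */
    return 0.5f * y;                          /* 1/(1+x) = (1/d)/2       */
}

int main(void)
{
    for (float x = 0.1f; x < 1.0f; x += 0.2f)
        printf("x=%.1f  approx=%.7f  exact=%.7f\n",
               x, one_over_one_plus_x(x), 1.0f / (1.0f + x));
    return 0;
}
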
@@ -378,56 +386,78 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Rescaled value.
*/
-#define ASYMM_RESCALE_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
- { \
- int exponent = src_integer_bits - dst_integer_bits; \
- return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
-#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
-#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
-#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
-#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
+#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
-
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
- { \
- const int left_shift = shift > 0 ? shift : 0; \
- const int right_shift = shift > 0 ? 0 : -shift; \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
+#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
+
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
}
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
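
Taken together, the pieces above implement the usual requantization scheme: a real scale s is stored as qmul * 2^shift with qmul a Q0.31 value in [2^30, 2^31), and applying it is a rounding-doubling high multiply plus a rounding shift. A plain-C model; the scale 0.0003 and its decomposition in main are made-up example values:

#include <stdint.h>
#include <stdio.h>

static int32_t rounding_high_mul(int32_t a, int32_t b)   /* models ASYMM_MULT */
{
    int64_t p     = (int64_t)a * (int64_t)b;
    int32_t nudge = p >= 0 ? (1 << 30) : (1 - (1 << 30));
    return (int32_t)((p + nudge) >> 31);                 /* round((2ab)/2^32)  */
}

static int32_t rounding_rshift(int32_t x, int e)
{
    int32_t mask = (int32_t)((1u << e) - 1);
    return (x >> e) + (((x & mask) > ((mask >> 1) + (x < 0 ? 1 : 0))) ? 1 : 0);
}

static int32_t mul_by_quantized_multiplier(int32_t v, int32_t qmul, int shift)
{
    int left  = shift > 0 ? shift : 0;                   /* pre-shift up     */
    int right = shift > 0 ? 0 : -shift;                  /* post-shift down  */
    return rounding_rshift(rounding_high_mul(v * (1 << left), qmul), right);
}

int main(void)
{
    /* Example: 0.0003 ~= 1319413954 * 2^-31 * 2^-11, so qmul = 1319413954
     * and shift = -11; 10000 * 0.0003 ~= 3. */
    printf("%d\n", (int)mul_by_quantized_multiplier(10000, 1319413954, -11));
    return 0;
}
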
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 2)
+QUANTIZE_IMPL(char, 2)
+QUANTIZE_IMPL(uint, 2)
+QUANTIZE_IMPL(int, 2)
+QUANTIZE_IMPL(uchar, 3)
+QUANTIZE_IMPL(char, 3)
+QUANTIZE_IMPL(uint, 3)
+QUANTIZE_IMPL(int, 3)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(int, 4)
+QUANTIZE_IMPL(uchar, 8)
+QUANTIZE_IMPL(char, 8)
+QUANTIZE_IMPL(uint, 8)
+QUANTIZE_IMPL(int, 8)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
@@ -439,9 +469,22 @@ DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 2)
+DEQUANTIZE_IMPL(char, 2)
+DEQUANTIZE_IMPL(uint, 2)
+DEQUANTIZE_IMPL(int, 2)
+DEQUANTIZE_IMPL(uchar, 3)
+DEQUANTIZE_IMPL(char, 3)
+DEQUANTIZE_IMPL(uint, 3)
+DEQUANTIZE_IMPL(int, 3)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(int, 4)
+DEQUANTIZE_IMPL(uchar, 8)
+DEQUANTIZE_IMPL(char, 8)
+DEQUANTIZE_IMPL(uint, 8)
+DEQUANTIZE_IMPL(int, 8)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
deleted file mode 100644
index a93cb4d1c7..0000000000
--- a/src/core/CL/cl_kernels/histogram.cl
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define VATOMIC_INC16(histogram, win_pos) \
- { \
- atomic_inc(histogram + win_pos.s0); \
- atomic_inc(histogram + win_pos.s1); \
- atomic_inc(histogram + win_pos.s2); \
- atomic_inc(histogram + win_pos.s3); \
- atomic_inc(histogram + win_pos.s4); \
- atomic_inc(histogram + win_pos.s5); \
- atomic_inc(histogram + win_pos.s6); \
- atomic_inc(histogram + win_pos.s7); \
- atomic_inc(histogram + win_pos.s8); \
- atomic_inc(histogram + win_pos.s9); \
- atomic_inc(histogram + win_pos.sa); \
- atomic_inc(histogram + win_pos.sb); \
- atomic_inc(histogram + win_pos.sc); \
- atomic_inc(histogram + win_pos.sd); \
- atomic_inc(histogram + win_pos.se); \
- atomic_inc(histogram + win_pos.sf); \
- }
-
-/** Calculate the histogram of an 8 bit grayscale image.
- *
- * Each thread will process 16 pixels and use one local atomic operation per pixel.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of length num_bins
- *
- * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] histogram_local The local buffer to hold histogram result in per workgroup. Supported data types: U32
- * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
- * @param[out] num_bins The number of bins
- * @param[out] offset The start of values to use (inclusive)
- * @param[out] range The range of a bin
- * @param[out] offrange The maximum value (exclusive)
- */
-__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
- __local uint *histogram_local,
- __global uint *restrict histogram,
- uint num_bins,
- uint offset,
- uint range,
- uint offrange)
-{
- Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
- uint local_id_x = get_local_id(0);
-
- uint local_x_size = get_local_size(0);
-
- if(num_bins > local_x_size)
- {
- for(int i = local_id_x; i < num_bins; i += local_x_size)
- {
- histogram_local[i] = 0;
- }
- }
- else
- {
- if(local_id_x <= num_bins)
- {
- histogram_local[local_id_x] = 0;
- }
- }
-
- uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
-
- uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));
-
- barrier(CLK_LOCAL_MEM_FENCE);
- VATOMIC_INC16(histogram_local, win_pos);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(num_bins > local_x_size)
- {
- for(int i = local_id_x; i < num_bins; i += local_x_size)
- {
- atomic_add(histogram + i, histogram_local[i]);
- }
- }
- else
- {
- if(local_id_x <= num_bins)
- {
- atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
- }
- }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image's border.
- *
- * Each thread will process one pixel using global atomic.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of length num_bins
- *
- * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
- * @param[out] num_bins The number of bins
- * @param[out] offset The start of values to use (inclusive)
- * @param[out] range The range of a bin
- * @param[out] offrange The maximum value (exclusive)
- */
-__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
- __global uint *restrict histogram,
- uint num_bins,
- uint offset,
- uint range,
- uint offrange)
-{
- Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-
- uint val = (uint)(*input_buffer.ptr);
-
- uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0;
-
- if(val >= offset && (val < offrange))
- {
- atomic_inc(histogram + win_pos);
- }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1.
- *
- * Each thread will process 16 pixels and use one local atomic operation per pixel.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of 256 elements
- *
- * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] histogram_local The local buffer to hold histogram result in per workgroup. Supported data types: U32
- * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
- */
-__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
- __local uint *histogram_local,
- __global uint *restrict histogram)
-{
- Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
-
- uint local_index = get_local_id(0);
- uint local_x_size = get_local_size(0);
-
- for(int i = local_index; i < 256; i += local_x_size)
- {
- histogram_local[i] = 0;
- }
-
- uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- atomic_inc(histogram_local + vals.s0);
- atomic_inc(histogram_local + vals.s1);
- atomic_inc(histogram_local + vals.s2);
- atomic_inc(histogram_local + vals.s3);
- atomic_inc(histogram_local + vals.s4);
- atomic_inc(histogram_local + vals.s5);
- atomic_inc(histogram_local + vals.s6);
- atomic_inc(histogram_local + vals.s7);
- atomic_inc(histogram_local + vals.s8);
- atomic_inc(histogram_local + vals.s9);
- atomic_inc(histogram_local + vals.sa);
- atomic_inc(histogram_local + vals.sb);
- atomic_inc(histogram_local + vals.sc);
- atomic_inc(histogram_local + vals.sd);
- atomic_inc(histogram_local + vals.se);
- atomic_inc(histogram_local + vals.sf);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for(int i = local_index; i < 256; i += local_x_size)
- {
- atomic_add(histogram + i, histogram_local[i]);
- }
-}
-
-/** Calculate the histogram of an 8 bit grayscale image with bin size as 256 and window size as 1.
- *
- * Each thread will process one pixel using global atomic.
- * When all work items in a work group are done the resulting local histograms are
- * added to the global histogram using global atomics.
- *
- * @note The input image is represented as a two-dimensional array of type uchar.
- * The output is represented as a one-dimensional uint array of 256 elements
- *
- * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
- * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32
- */
-__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
- __global uint *restrict histogram)
-{
- Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
- atomic_inc(histogram + *input_buffer.ptr);
-}
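
For reference, the two-level reduction the deleted kernels relied on, as a serial C sketch: each work-group accumulates into a private local-memory histogram with cheap local atomics, then the per-group results are merged into the global histogram with one atomic add per bin. The serial loops below stand in for the parallel work-items, and offrange is taken as offset + range, both assumptions for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { NUM_BINS = 16, GROUP = 256 };

static void histogram(const uint8_t *img, size_t n, uint32_t offset,
                      uint32_t range, uint32_t *global_hist)
{
    for (size_t g = 0; g < n; g += GROUP)           /* one "work-group"    */
    {
        uint32_t local_hist[NUM_BINS];
        memset(local_hist, 0, sizeof(local_hist));  /* zero local memory   */
        size_t end = g + GROUP < n ? g + GROUP : n;
        for (size_t i = g; i < end; ++i)            /* local atomic_inc    */
        {
            uint32_t v = img[i];
            if (v >= offset && v < offset + range)
                ++local_hist[(v - offset) * NUM_BINS / range];
        }
        for (int b = 0; b < NUM_BINS; ++b)          /* global atomic_add   */
            global_hist[b] += local_hist[b];
    }
}

int main(void)
{
    uint8_t img[1024];
    for (size_t i = 0; i < sizeof(img); ++i) img[i] = (uint8_t)(i & 0xff);
    uint32_t hist[NUM_BINS] = { 0 };
    histogram(img, sizeof(img), 0, 256, hist);
    for (int b = 0; b < NUM_BINS; ++b) printf("bin %2d: %u\n", b, (unsigned)hist[b]);
    return 0;
}
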
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
deleted file mode 100644
index b14f361df6..0000000000
--- a/src/core/CL/cl_kernels/hog.cl
+++ /dev/null
@@ -1,456 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
-
-/** This OpenCL kernel computes the HOG orientation binning
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DCELL_WIDTH = Width of the cell
- * -# -DCELL_HEIGHT = height of the cell
- * -# -DNUM_BINS = Number of bins for each cell
- * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
- *
- * @note Each work-item computes a single cell
- *
- * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
- * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes)
- * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes)
- * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image
- * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
- * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes)
- * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] phase_stride_y Stride of the phase image in Y dimension (in bytes)
- * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the phase image
- * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
- IMAGE_DECLARATION(phase),
- IMAGE_DECLARATION(dst))
-{
- float bins[NUM_BINS] = { 0 };
-
- // Compute address for the magnitude and phase images
- Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
- Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
-
- __global uchar *mag_row_ptr = mag.ptr;
- __global uchar *phase_row_ptr = phase.ptr;
-
- for(int yc = 0; yc < CELL_HEIGHT; ++yc)
- {
- int xc = 0;
- for(; xc <= (CELL_WIDTH - 4); xc += 4)
- {
- // Load magnitude and phase values
- const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
- float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
-
- // Scale phase: phase * scale + 0.5f
- phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
-
- // Compute histogram index.
- int4 hidx_s32 = convert_int4(phase_f32);
-
- // Compute magnitude weights (w0 and w1)
- const float4 hidx_f32 = convert_float4(hidx_s32);
-
- // w1 = phase_f32 - hidx_s32
- const float4 w1_f32 = phase_f32 - hidx_f32;
-
- // w0 = 1.0 - w1
- const float4 w0_f32 = (float4)1.0f - w1_f32;
-
- // Calculate the weights for splitting vote
- const float4 mag_w0_f32 = mag_f32 * w0_f32;
- const float4 mag_w1_f32 = mag_f32 * w1_f32;
-
- // Weighted vote between 2 bins
-
- // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
- hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
- // Bin 0
- bins[hidx_s32.s0] += mag_w0_f32.s0;
- bins[hidx_s32.s1] += mag_w0_f32.s1;
- bins[hidx_s32.s2] += mag_w0_f32.s2;
- bins[hidx_s32.s3] += mag_w0_f32.s3;
-
- hidx_s32 += (int4)1;
-
- // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
- hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
-
- // Bin1
- bins[hidx_s32.s0] += mag_w1_f32.s0;
- bins[hidx_s32.s1] += mag_w1_f32.s1;
- bins[hidx_s32.s2] += mag_w1_f32.s2;
- bins[hidx_s32.s3] += mag_w1_f32.s3;
- }
-
- // Left over computation
- for(; xc < CELL_WIDTH; xc++)
- {
- const float mag_value = *((__global short *)mag_row_ptr + xc);
- const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
- const float w1 = phase_value - floor(phase_value);
-
- // The quantised phase is the histogram index [0, NUM_BINS - 1]
- // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
- const uint hidx = (uint)(phase_value) % NUM_BINS;
-
- // Weighted vote between 2 bins
- bins[hidx] += mag_value * (1.0f - w1);
- bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
- }
-
- // Point to the next row of magnitude and phase images
- mag_row_ptr += mag_stride_y;
- phase_row_ptr += phase_stride_y;
- }
-
- // Compute address for the destination image
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Store the local HOG in the global memory
- int xc = 0;
- for(; xc <= (NUM_BINS - 4); xc += 4)
- {
- float4 values = vload4(0, bins + xc);
-
- vstore4(values, 0, ((__global float *)dst.ptr) + xc);
- }
-
- // Left over stores
- for(; xc < NUM_BINS; ++xc)
- {
- ((__global float *)dst.ptr)[xc] = bins[xc];
- }
-}
-#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
-
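
The core of the deleted binning kernel is the soft vote: the scaled phase lands between two bins, and the magnitude is split between them with weights (1 - w1) and w1, where w1 is the fractional part. A scalar C model that follows the kernel's leftover path; the 9 bins and 180-degree phase span are assumed example values:

#include <math.h>
#include <stdio.h>

#define NUM_BINS    9
#define PHASE_SCALE (NUM_BINS / 180.0f)   /* assumed phase span of 180 deg */

static void vote(float bins[NUM_BINS], float magnitude, float phase_deg)
{
    float p    = phase_deg * PHASE_SCALE + 0.5f;
    float w1   = p - floorf(p);              /* weight of the upper bin */
    unsigned h = (unsigned)p % NUM_BINS;     /* wrap NUM_BINS back to 0 */
    bins[h]                  += magnitude * (1.0f - w1);
    bins[(h + 1) % NUM_BINS] += magnitude * w1;
}

int main(void)
{
    float bins[NUM_BINS] = { 0 };
    vote(bins, 2.0f, 25.0f);    /* 25 deg splits between bins 1 and 2 */
    for (int i = 0; i < NUM_BINS; ++i) printf("bin %d: %.3f\n", i, bins[i]);
    return 0;
}
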
-#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
-
-#ifndef L2_NORM
-#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
-#endif /* not L2_NORM */
-
-#ifndef L2HYS_NORM
-#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
-#endif /* not L2HYS_NORM */
-
-#ifndef L1_NORM
-#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
-#endif /* not L1_NORM */
-
-/** This OpenCL kernel computes the HOG block normalization
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
- * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
- * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
- * -# -DHOG_NORM_TYPE = Normalization type
- * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
- * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
- * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
- * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
- *
- * @note Each work-item computes a single block
- *
- * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- float sum = 0.0f;
- float4 sum_f32 = (float4)(0.0f);
-
- // Compute address for the source and destination tensor
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
- {
- const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
-
- int xc = 0;
- for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
- {
- const float4 val0 = vload4(0, hist_ptr + xc + 0);
- const float4 val1 = vload4(0, hist_ptr + xc + 4);
- const float4 val2 = vload4(0, hist_ptr + xc + 8);
- const float4 val3 = vload4(0, hist_ptr + xc + 12);
-
-#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
- // Compute val^2 for L2_NORM or L2HYS_NORM
- sum_f32 += val0 * val0;
- sum_f32 += val1 * val1;
- sum_f32 += val2 * val2;
- sum_f32 += val3 * val3;
-#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
- // Compute |val| for L1_NORM
- sum_f32 += fabs(val0);
- sum_f32 += fabs(val1);
- sum_f32 += fabs(val2);
- sum_f32 += fabs(val3);
-#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
- // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
- // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
- // will be accessed consecutively
- vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
- vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
- vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
- vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
- }
-
- // Compute left over
- for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
- {
- const float val = hist_ptr[xc];
-
-#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
- sum += val * val;
-#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
- sum += fabs(val);
-#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
-
- ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
- }
- }
-
- sum += dot(sum_f32, (float4)1.0f);
-
- float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
-
-#if(HOG_NORM_TYPE == L2HYS_NORM)
- // Reset sum
- sum_f32 = (float4)0.0f;
- sum = 0.0f;
-
- int k = 0;
- for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
- {
- float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
- float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
- float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
- float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
-
- // Scale val
- val0 = val0 * (float4)scale;
- val1 = val1 * (float4)scale;
- val2 = val2 * (float4)scale;
- val3 = val3 * (float4)scale;
-
- // Clip val if over _threshold_l2hys
- val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
- val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
- val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
- val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
-
- // Compute val^2
- sum_f32 += val0 * val0;
- sum_f32 += val1 * val1;
- sum_f32 += val2 * val2;
- sum_f32 += val3 * val3;
-
- vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
- vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
- vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
- vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
- }
-
- // Compute left over
- for(; k < NUM_BINS_PER_BLOCK; ++k)
- {
- float val = ((__global float *)dst.ptr)[k] * scale;
-
- // Clip scaled input_value if over L2_HYST_THRESHOLD
- val = fmin(val, (float)L2_HYST_THRESHOLD);
-
- sum += val * val;
-
- ((__global float *)dst.ptr)[k] = val;
- }
-
- sum += dot(sum_f32, (float4)1.0f);
-
- // We use the same constants of OpenCV
- scale = 1.0f / (sqrt(sum) + 1e-3f);
-
-#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
-
- int i = 0;
- for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
- {
- float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
- float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
- float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
- float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
-
- // Multiply val by the normalization scale factor
- val0 = val0 * (float4)scale;
- val1 = val1 * (float4)scale;
- val2 = val2 * (float4)scale;
- val3 = val3 * (float4)scale;
-
- vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
- vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
- vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
- vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
- }
-
- for(; i < NUM_BINS_PER_BLOCK; ++i)
- {
- ((__global float *)dst.ptr)[i] *= scale;
- }
-}
-#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
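
The L2-Hys branch of the deleted kernel is, in outline: L2-normalize the block descriptor, clip every component at L2_HYST_THRESHOLD, then renormalize. A scalar C model with the same stabilizing constants (0.1f per bin in the first pass, then OpenCV's 1e-3f); the 0.2f threshold is an assumed example value:

#include <math.h>
#include <stdio.h>

#define L2_HYST_THRESHOLD 0.2f

static void l2hys_normalize(float *v, int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += v[i] * v[i];
    float scale = 1.0f / (sqrtf(sum) + n * 0.1f);      /* first L2 pass   */

    sum = 0.0f;
    for (int i = 0; i < n; ++i)
    {
        v[i] = fminf(v[i] * scale, L2_HYST_THRESHOLD); /* scale and clip  */
        sum += v[i] * v[i];
    }
    scale = 1.0f / (sqrtf(sum) + 1e-3f);               /* renormalize     */
    for (int i = 0; i < n; ++i) v[i] *= scale;
}

int main(void)
{
    float block[4] = { 10.0f, 1.0f, 1.0f, 1.0f };      /* one dominant bin */
    l2hys_normalize(block, 4);
    for (int i = 0; i < 4; ++i) printf("%.4f ", block[i]);
    printf("\n");                                      /* clipping caps the dominant bin */
    return 0;
}
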
-
-#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
-
-/** This OpenCL kernel computes the HOG detector using linear SVM
- *
- * @attention The following variables must be passed at compile time:
- *
- * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
- * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
- * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
- * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
- * -# -DIDX_CLASS = Index of the class to detect
- * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
- * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
- * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
- * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
- *
- * @note Each work-item computes a single detection window
- *
- * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] hog_descriptor Pointer to HOG descriptor. Supported data types: F32
- * @param[out] dst Pointer to DetectionWindow array
- * @param[out] num_detection_windows Number of objects detected
- */
-__kernel void hog_detector(IMAGE_DECLARATION(src),
- __global float *hog_descriptor,
- __global DetectionWindow *dst,
- __global uint *num_detection_windows)
-{
- // Check if the DetectionWindow array is full
- if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
- {
- return;
- }
-
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- const int src_step_y_f32 = src_stride_y / sizeof(float);
-
- // Init score_f32 with 0
- float4 score_f32 = (float4)0.0f;
-
- // Init score with 0
- float score = 0.0f;
-
- __global float *src_row_ptr = (__global float *)src.ptr;
-
- // Compute Linear SVM
- for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
- {
- int xb = 0;
-
- const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
-
- for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
- {
- // Load descriptor values
- float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
- float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
-
- float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
- float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
-
- // Multiply accumulate
- score_f32 += a0_f32 * b0_f32;
- score_f32 += a1_f32 * b1_f32;
- }
-
- for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
- {
- const float a = src_row_ptr[xb];
- const float b = hog_descriptor[xb + offset_y];
-
- score += a * b;
- }
- }
-
- score += dot(score_f32, (float4)1.0f);
-
- // Add the bias. The bias is located at the position (descriptor_size() - 1)
- // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
- score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
-
- if(score > (float)THRESHOLD)
- {
- int id = atomic_inc(num_detection_windows);
- if(id < MAX_NUM_DETECTION_WINDOWS)
- {
- dst[id].x = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
- dst[id].y = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
- dst[id].width = DETECTION_WINDOW_WIDTH;
- dst[id].height = DETECTION_WINDOW_HEIGHT;
- dst[id].idx_class = IDX_CLASS;
- dst[id].score = score;
- }
- }
-}
-#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
- * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
deleted file mode 100644
index dd2c7982f4..0000000000
--- a/src/core/CL/cl_kernels/integral_image.cl
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function computes the horizontal integral of the image.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U32
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void integral_horizontal(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- uint prev = 0;
-
- for(uint j = 0; j < src_step_x; j += 16)
- {
- barrier(CLK_GLOBAL_MEM_FENCE);
- uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
- res.s0 += prev;
- res.s1 += res.s0;
- res.s2 += res.s1;
- res.s3 += res.s2;
- res.s4 += res.s3;
- res.s5 += res.s4;
- res.s6 += res.s5;
- res.s7 += res.s6;
- res.s8 += res.s7;
- res.s9 += res.s8;
- res.sA += res.s9;
- res.sB += res.sA;
- res.sC += res.sB;
- res.sD += res.sC;
- res.sE += res.sD;
- res.sF += res.sE;
- prev = res.sF;
- vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
- }
-}
-
-/** This function computes the vertical integral of the image.
- *
- * @param[in,out] src_ptr Pointer to the source image. Supported data types: U32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] height Image height.
- */
-__kernel void integral_vertical(
- IMAGE_DECLARATION(src),
- uint height)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0));
- for(uint j = 1; j < height; ++j)
- {
- barrier(CLK_GLOBAL_MEM_FENCE);
- uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
- res += prev;
- vstore8(res, 0, (__global uint *)offset(&src, 0, j));
- prev = res;
- }
-}
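
The two deleted kernels compose into the standard separable integral image: a horizontal prefix-sum pass per row, then a vertical pass that adds the row above, so dst[y][x] ends up holding the sum of src over the rectangle [0..y] x [0..x]. A serial C model:

#include <stdint.h>
#include <stdio.h>

enum { W = 4, H = 3 };

static void integral_image(const uint8_t src[H][W], uint32_t dst[H][W])
{
    for (int y = 0; y < H; ++y)              /* integral_horizontal */
    {
        uint32_t prev = 0;
        for (int x = 0; x < W; ++x)
            dst[y][x] = prev += src[y][x];   /* running row sum     */
    }
    for (int y = 1; y < H; ++y)              /* integral_vertical   */
        for (int x = 0; x < W; ++x)
            dst[y][x] += dst[y - 1][x];
}

int main(void)
{
    const uint8_t src[H][W] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 } };
    uint32_t dst[H][W];
    integral_image(src, dst);
    for (int y = 0; y < H; ++y, printf("\n"))
        for (int x = 0; x < W; ++x)
            printf("%4u", (unsigned)dst[y][x]);   /* bottom-right prints 78 */
    return 0;
}
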
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
deleted file mode 100644
index 14b37e3257..0000000000
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This kernel performs l2 normalization on x-axis
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] epsilon Epsilon value
- */
-__kernel void l2_normalize_x(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(sum),
- IMAGE_DECLARATION(dst),
- DATA_TYPE epsilon)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, (__global DATA_TYPE *)src.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
-
- vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-/** This kernel performs l2 normalization on y-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] epsilon Epsilon value
- */
-__kernel void l2_normalize_y(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(sum),
- IMAGE_DECLARATION(dst),
- DATA_TYPE epsilon)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, (__global DATA_TYPE *)src.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
-
- vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
-}
-/** This kernel performs l2 normalization on z-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] epsilon Epsilon value
- */
-__kernel void l2_normalize_z(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst),
- DATA_TYPE epsilon)
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D sum = CONVERT_TO_TENSOR3D_STRUCT(sum);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in = vload16(0, (__global DATA_TYPE *)src.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
-
- vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
-} \ No newline at end of file
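
All three deleted variants reduce to the same scalar recipe: given the sum of squares along the normalized axis (produced by a prior reduction stage), multiply each element by rsqrt(max(sum, epsilon)). In C:

#include <math.h>
#include <stdio.h>

static void l2_normalize(float *v, int n, float epsilon)
{
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += v[i] * v[i];    /* the reduction stage */
    float scale = 1.0f / sqrtf(fmaxf(sum, epsilon));   /* rsqrt(fmax(sum, eps)) */
    for (int i = 0; i < n; ++i) v[i] *= scale;
}

int main(void)
{
    float v[3] = { 3.0f, 0.0f, 4.0f };
    l2_normalize(v, 3, 1e-12f);
    printf("%.2f %.2f %.2f\n", v[0], v[1], v[2]);      /* 0.60 0.00 0.80 */
    return 0;
}
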
diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h
index 56b1538c6f..4daf0adc89 100644
--- a/src/core/CL/cl_kernels/load_store_utility.h
+++ b/src/core/CL/cl_kernels/load_store_utility.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -223,8 +223,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
/** Convert and store a block of the given size M0xN0
@@ -245,8 +247,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
/** Partially store the 0 to (n-1)th rows of the given variables
@@ -365,8 +369,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
*
* @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
@@ -388,22 +394,23 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
- { \
- STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
}
/** Store a block that can only be partial in x but not y.
*
@@ -425,7 +432,7 @@
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X)) \
+ if (!(PARTIAL_COND_X)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -453,7 +460,7 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
*/
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
- if(!(PARTIAL_COND_Y)) \
+ if (!(PARTIAL_COND_Y)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -463,8 +470,6 @@
}
/** @} */ // end of group STORE_BLOCK_PARTIAL
-#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
/** Boundary-aware GEMM block store
* @name STORE_BLOCK_BOUNDARY_AWARE
* This macro assumes the following schemes to achieve boundary-awareness:
@@ -516,32 +521,37 @@
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
* @{
*/
+#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
+ STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X)
#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
-#if defined(PARTIAL_STORE_M0)
/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
* @name COMPUTE_M0_START_ROW
* If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
@@ -558,16 +568,16 @@
* @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
* @{
*/
+#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
- ((uint)(y * M0))
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
/** Store a vector that can only be partial in x.
- *
+ * @name STORE_VECTOR_SELECT
 * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
*
* The data to store is expected to end in a 0.
@@ -583,4 +593,4 @@
*/
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
-/** @} */ // end of group STORE_VECTOR_SELECT \ No newline at end of file
+/** @} */ // end of group STORE_VECTOR_SELECT
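To make the reflowed macros above easier to follow, here is a scalar C sketch (illustrative only, not the library's code) of the dispatch STORE_BLOCK_BOUNDARY_AWARE performs at compile time, together with the COMPUTE_M0_START_ROW arithmetic:

/* Scalar C sketch of the selection behind STORE_BLOCK_PARTIAL_IN_X_AND_Y:
 * the partial sizes are used only when the corresponding condition holds.
 * (The real macros expand to vectorised vstore calls.) */
static void store_block_boundary_aware(const float *block, float *dst,
                                       int stride_y, int m0, int n0,
                                       int partial_m0, int partial_n0,
                                       int cond_y, int cond_x)
{
    const int rows = cond_y ? partial_m0 : m0; /* partial store in y */
    const int cols = cond_x ? partial_n0 : n0; /* partial store in x */
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            dst[r * stride_y + c] = block[r * n0 + c];
}

/* COMPUTE_M0_START_ROW: partial blocks in y sit at the beginning of the
 * rows, so every work-item after the first is shifted back by
 * (M0 - PARTIAL_STORE_M0) % M0, clamped at zero. */
static unsigned int compute_m0_start_row(int y, int m0, int partial_m0)
{
    const int row = y * m0 - ((m0 - partial_m0) % m0);
    return (unsigned int)(row > 0 ? row : 0);
}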
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
deleted file mode 100644
index 48197d6473..0000000000
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Calculates L1 normalization between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return L1 normalization magnitude result. Supported data types: S16, S32
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
- return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
-}
-
-/** Calculates L2 normalization between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return L2 normalization magnitude result. Supported data types: S16, S32
- */
-inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
-{
- return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
- VEC_DATA_TYPE(DATA_TYPE, 16));
-}
-
-/** Calculates unsigned phase between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
- */
-inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
- float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
- angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
- return convert_uchar16(angle_deg_f32);
-}
-
-/** Calculates signed phase between two inputs.
- *
- * @param[in] a First input. Supported data types: S16, S32
- * @param[in] b Second input. Supported data types: S16, S32
- *
- * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
- */
-inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
-{
- float16 arct = atan2pi(convert_float16(b), convert_float16(a));
- arct = select(arct, arct + 2, arct < 0.0f);
-
- return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu);
-}
-
-#if(1 == MAGNITUDE)
-#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
-#elif(2 == MAGNITUDE)
-#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
-#else /* MAGNITUDE */
-#define MAGNITUDE_OP(x, y)
-#endif /* MAGNITUDE */
-
-#if(1 == PHASE)
-#define PHASE_OP(x, y) phase_unsigned((x), (y))
-#elif(2 == PHASE)
-#define PHASE_OP(x, y) phase_signed((x), (y))
-#else /* PHASE */
-#define PHASE_OP(x, y)
-#endif /* PHASE */
-
-/** Calculate the magnitude and phase of given the gradients of an image.
- *
- * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2).
- * @note Phase calculation supported: Unsigned(type = 1) [0,180] and Signed(type = 2) [0,256).
- *
- * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1
- * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1
- * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int
- *
- * @param[in] gx_ptr Pointer to the first source image (gradient X). Supported data types: S16, S32
- * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] gy_ptr Pointer to the second source image (gradient Y) . Supported data types: S16, S32
- * @param[in] gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] magnitude_ptr Pointer to the magnitude destination image. Supported data types: S16, S32
- * @param[in] magnitude_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] magnitude_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] phase_ptr Pointer to the phase destination image. Supported data types: U8
- * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image
- * */
-__kernel void magnitude_phase(
- IMAGE_DECLARATION(gx),
- IMAGE_DECLARATION(gy)
-#ifdef MAGNITUDE
- ,
- IMAGE_DECLARATION(magnitude)
-#endif /* MAGNITUDE */
-#ifdef PHASE
- ,
- IMAGE_DECLARATION(phase)
-#endif /* PHASE */
-)
-{
- // Get pixels pointer
- Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
- Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
- VEC_DATA_TYPE(DATA_TYPE, 16)
- in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
-
- // Calculate and store the results
-#ifdef MAGNITUDE
- Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
- vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
-#endif /* MAGNITUDE */
-#ifdef PHASE
- Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
- vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
-#endif /* PHASE */
-}
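For reference, the three operations removed with magnitude_phase.cl reduce to the following scalar C (saturation and vectorisation omitted; a sketch, not the deleted kernel itself):

#include <math.h>
#include <stdint.h>
#include <stdlib.h>

static int magnitude_l1(int a, int b) /* |a| + |b|; add_sat omitted */
{
    return abs(a) + abs(b);
}

static int magnitude_l2(int a, int b) /* round(sqrt(a^2 + b^2)) */
{
    return (int)(sqrt((double)a * a + (double)b * b) + 0.5);
}

static uint8_t phase_unsigned(int gx, int gy) /* angle in [0, 180] */
{
    /* atan2pi(gy, gx) * 180 == atan2 in degrees */
    double deg = atan2((double)gy, (double)gx) * (180.0 / 3.14159265358979323846);
    if (deg < 0.0)
        deg += 180.0; /* fold negative angles into [0, 180] */
    return (uint8_t)deg;
}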
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
deleted file mode 100644
index 4ddf931e4b..0000000000
--- a/src/core/CL/cl_kernels/mean_stddev.cl
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
-/** This function calculates the sum and sum of squares of a given input image.
- *
- * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] height Height of the input image
- * @param[out] global_sum Global sum of all elements
- * @param[out] global_sum_sq Global sum of squares of all elements
- */
-__kernel void mean_stddev_accumulate(
- IMAGE_DECLARATION(src),
- uint height,
- __global ulong *global_sum
-#ifdef STDDEV
- ,
- __global ulong *global_sum_sq
-#endif /* STDDEV */
-)
-{
- // Get pixels pointer
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- uint8 tmp_sum = 0;
-#ifdef STDDEV
- uint8 tmp_sum_sq = 0;
-#endif /* STDDEV */
- // Calculate partial sum
- for(int i = 0; i < height; i++)
- {
- // Load data
- uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
-
- tmp_sum += data;
-#ifdef STDDEV
- tmp_sum_sq += data * data;
-#endif /* STDDEV */
- }
- // Perform reduction
- tmp_sum.s0123 += tmp_sum.s4567;
- tmp_sum.s01 += tmp_sum.s23;
- atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
-
-#ifdef STDDEV
- tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
- tmp_sum_sq.s01 += tmp_sum_sq.s23;
- atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
-#endif /* STDDEV */
-}
-
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
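The accumulation pattern deleted here — per-column partial sums over the image height, a pairwise reduction, then one 64-bit atomic add — can be sketched in plain C as follows (atomics replaced by ordinary adds; illustrative only):

#include <stdint.h>

static void mean_stddev_accumulate(const uint8_t *src, int stride_y,
                                   int height, uint64_t *global_sum,
                                   uint64_t *global_sum_sq)
{
    uint32_t sum[8] = {0}, sum_sq[8] = {0};
    for (int y = 0; y < height; ++y)
        for (int i = 0; i < 8; ++i)
        {
            const uint32_t v = src[y * stride_y + i];
            sum[i] += v;
            sum_sq[i] += v * v;
        }
    /* Pairwise reduction mirroring tmp_sum.s0123 += tmp_sum.s4567; etc. */
    for (int step = 4; step >= 1; step /= 2)
        for (int i = 0; i < step; ++i)
        {
            sum[i] += sum[i + step];
            sum_sq[i] += sum_sq[i + step];
        }
    *global_sum += sum[0];
    *global_sum_sq += sum_sq[0];
}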
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
deleted file mode 100644
index 1045f22fb1..0000000000
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-#ifndef DATA_TYPE_MIN
-#define DATA_TYPE_MIN 0x0
-#endif /* DATA_TYPE_MIN */
-
-#ifndef DATA_TYPE_MAX
-#define DATA_TYPE_MAX 0xFF
-#endif /* DATA_TYPE_MAX */
-
-inline int FloatFlip(float val)
-{
- union
- {
- int int_val;
- float flt_val;
- } u_val;
- u_val.flt_val = val;
- return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF;
-}
-
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
-__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-/** This function identifies the min and maximum value of an input image.
- *
- * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
- * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
- * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
- * @param[in] width Input image width
- */
-__kernel void minmax(
- IMAGE_DECLARATION(src),
- __global int *min_max,
- int width)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- // Initialize local minimum and local maximum
- VEC_DATA_TYPE(DATA_TYPE, 16)
- local_min = type_max;
- VEC_DATA_TYPE(DATA_TYPE, 16)
- local_max = type_min;
-
- // Calculate min/max of row
- int i = 0;
- for(; i + 16 <= width; i += 16)
- {
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
- local_min = min(data, local_min);
- local_max = max(data, local_max);
- }
-
-#ifdef NON_MULTIPLE_OF_16
- // Handle non multiple of 16
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
-#ifdef IS_DATA_TYPE_FLOAT
- int16 valid_indices = (i + idx16) < width;
-#else /* IS_DATA_TYPE_FLOAT */
- VEC_DATA_TYPE(DATA_TYPE, 16)
- valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
-#endif /* IS_DATA_TYPE_FLOAT */
- local_max = max(local_max, select(type_min, data, valid_indices));
- local_min = min(local_min, select(type_max, data, valid_indices));
-#endif /* NON_MULTIPLE_OF_16 */
-
- // Perform min/max reduction
- local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
- local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
-
- local_min.s0123 = min(local_min.s0123, local_min.s4567);
- local_max.s0123 = max(local_max.s0123, local_max.s4567);
-
- local_min.s01 = min(local_min.s01, local_min.s23);
- local_max.s01 = max(local_max.s01, local_max.s23);
-
- local_min.s0 = min(local_min.s0, local_min.s1);
- local_max.s0 = max(local_max.s0, local_max.s1);
-
- // Update global min/max
-#ifdef IS_DATA_TYPE_FLOAT
- atomic_min(&min_max[0], FloatFlip(local_min.s0));
- atomic_max(&min_max[1], FloatFlip(local_max.s0));
-#else /* IS_DATA_TYPE_FLOAT */
- atomic_min(&min_max[0], local_min.s0);
- atomic_max(&min_max[1], local_max.s0);
-#endif /* IS_DATA_TYPE_FLOAT */
-}
-
-/** This function counts the min and max occurrences in an image and tags their position.
- *
- * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
- * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
- * @param[out] min_max_count Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
- * @param[out] min_loc Array that holds the location of the minimum value occurrences
- * @param[in] max_min_loc_count The maximum number of min value occurrences coordinates the array can hold
- * @param[out] max_loc Array that holds the location of the maximum value occurrences
- * @param[in] max_max_loc_count The maximum number of max value occurrences coordinates the array can hold
- */
-__kernel void minmaxloc(
- IMAGE_DECLARATION(src),
- __global int *min_max,
- __global uint *min_max_count
-#ifdef LOCATE_MIN
- ,
- __global Coordinates2D *min_loc, uint max_min_loc_count
-#endif /* LOCATE_MIN */
-#ifdef LOCATE_MAX
- ,
- __global Coordinates2D *max_loc, uint max_max_loc_count
-#endif /* LOCATE_MAX */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-#ifdef IS_DATA_TYPE_FLOAT
- __global float *min_max_ptr = (__global float *)min_max;
- float min_value = min_max_ptr[0];
- float max_value = min_max_ptr[1];
-#else /* IS_DATA_TYPE_FLOAT */
- int min_value = min_max[0];
- int max_value = min_max[1];
-#endif /* IS_DATA_TYPE_FLOAT */
-
- DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
-#ifdef COUNT_MIN_MAX
- if(value == min_value)
- {
- uint idx = atomic_inc(&min_max_count[0]);
-#ifdef LOCATE_MIN
- if(idx < max_min_loc_count)
- {
- min_loc[idx].x = get_global_id(0);
- min_loc[idx].y = get_global_id(1);
- }
-#endif /* LOCATE_MIN */
- }
- if(value == max_value)
- {
- uint idx = atomic_inc(&min_max_count[1]);
-#ifdef LOCATE_MAX
- if(idx < max_max_loc_count)
- {
- max_loc[idx].x = get_global_id(0);
- max_loc[idx].y = get_global_id(1);
- }
-#endif /* LOCATE_MAX */
- }
-#endif /* COUNT_MIN_MAX */
-}
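The FloatFlip helper deleted above is the standard trick for using integer atomic_min/atomic_max on IEEE-754 floats; a C sketch of the same bit manipulation:

#include <string.h>

/* For non-negative floats the IEEE-754 bit pattern already orders like a
 * signed int; negative floats order in reverse, so XOR-ing their magnitude
 * bits (0x7FFFFFFF) restores a total order usable with integer atomics. */
static int float_flip(float val)
{
    int bits;
    memcpy(&bits, &val, sizeof(bits)); /* type-pun without UB */
    return (bits >= 0) ? bits : bits ^ 0x7FFFFFFF;
}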
diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
index 1217a37345..d83e81347e 100644
--- a/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,14 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Space to depth transformation. (NCHW)
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -39,6 +41,12 @@
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the block shape vector. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape vector in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the block shape vector in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape vector
* @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -48,29 +56,38 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void space_to_depth_nchw(
+__kernel void batch_to_space_nchw(
TENSOR4D_DECLARATION(input),
const int batch_id,
+ VECTOR_DECLARATION(block_shape),
TENSOR3D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
const int x = get_global_id(0);
const int y = get_global_id(1);
- const int z = get_global_id(2) % r;
+ const int z = get_global_id(2);
- const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
- const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * block_x) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, in_batch));
}
-/** Space to depth transformation. (NHWC)
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The left and top crops must be passed at compile time using -DCROP_LEFT and -DCROP_TOP. e.g. -DCROP_LEFT=0
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -90,22 +107,25 @@ __kernel void space_to_depth_nchw(
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void space_to_depth_nhwc(
+__kernel void batch_to_space_static_nchw(
TENSOR4D_DECLARATION(input),
const int batch_id,
TENSOR3D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0) % r;
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int x = get_global_id(0) + CROP_LEFT;
+ const int y = get_global_id(1) + CROP_TOP;
+ const int z = get_global_id(2);
- const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
- const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * block_x) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, in_batch));
}
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
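Both kernels above share one index mapping: each output pixel reads from a downscaled spatial position in an input batch selected by the pixel's position within the block. A C sketch of that mapping (names illustrative):

/* batch_to_space: output (x, y, z, batch_id) reads input batch
 * batch_id + ((x % bx) + (y % by) * bx) * BATCH_SIZE at (x / bx, y / by). */
static void batch_to_space_map(int x, int y, int batch_id, int batch_size,
                               int block_x, int block_y,
                               int *in_x, int *in_y, int *in_batch)
{
    *in_batch = batch_id + ((x % block_x) + (y % block_y) * block_x) * batch_size;
    *in_x     = x / block_x;
    *in_y     = y / block_y;
}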
diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
new file mode 100644
index 0000000000..2d466661b3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in;
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = 0;
+
+ const int current_slice = get_global_id(2);
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+ res = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+ // beta is not zero, hence we need to perform the addition
+ res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */ \ No newline at end of file
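The kernel body above implements the usual batch-normalization equation; a scalar C sketch (activation step omitted, identity assumed):

#include <math.h>

/* x_bar = (x - mean) / sqrt(var + epsilon); res = gamma * x_bar + beta.
 * With USE_DEFAULT_GAMMA / USE_DEFAULT_BETA the kernel skips the multiply
 * (gamma == 1) or the add (beta == 0). */
static float batchnorm(float x, float mean, float var,
                       float beta, float gamma, float epsilon)
{
    const float denominator = 1.0f / sqrtf(var + epsilon); /* INVSQRT_OP */
    const float x_bar       = (x - mean) * denominator;
    return gamma * x_bar + beta;
}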
diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
new file mode 100644
index 0000000000..84396e122f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x)/(y)); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ uint curr_channel = 0; // channel id of input
+ uint batch_id = 0; // batch id
+ uint group_id = 0; // group id
+ uint channel_id = 0; // channel id within the group
+
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+ const uint x = get_global_id(0) * VEC_SIZE;
+ const uint y = get_global_id(1) * 2;
+ const uint z = channel_id * NUM_GROUPS + group_id;
+
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+ // Store blocks
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ VSTORE(VEC_SIZE)
+ (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+ VSTORE(VEC_SIZE)
+ (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
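The permutation computed above is a transpose of the channel axis viewed as a (NUM_GROUPS x K) matrix; a C sketch of the destination-channel computation:

/* Source channel c = g * K + k is written to destination channel
 * k * NUM_GROUPS + g, i.e. a (groups x K) -> (K x groups) transpose. */
static int channel_shuffle_dst_channel(int c, int num_groups, int k_per_group)
{
    const int group_id   = c / k_per_group; /* g */
    const int channel_id = c % k_per_group; /* k */
    return channel_id * num_groups + group_id;
}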
diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
new file mode 100644
index 0000000000..57183393d2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
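The index arithmetic above can be summarised by the following C sketch of the output coordinate computation (r is the number of output channels):

/* depth_to_space: with r = CHANNEL_SIZE / BLOCK_SHAPE^2 and gz the global
 * z id, input (x, y, gz) lands at output channel gz % r, upscaled in space
 * by BLOCK_SHAPE. */
static void depth_to_space_map(int x, int y, int gz, int channel_size,
                               int block_shape,
                               int *out_x, int *out_y, int *out_z)
{
    const int r = channel_size / (block_shape * block_shape);
    *out_z = gz % r;
    *out_x = x * block_shape + (gz / r) % block_shape;
    *out_y = y * block_shape + (gz / r) / block_shape;
}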
diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
new file mode 100644
index 0000000000..e0203f7408
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nchw(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = scale[get_global_id(2)];
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file
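Per-channel dequantization as implemented above is a single widen-and-scale per element, with the channel (the z coordinate) selecting the scale; a scalar C sketch:

/* QSYMM8_PER_CHANNEL value -> float: widen to int, multiply by the scale
 * of the element's channel. */
static float dequantize_per_channel(signed char q, const float *scale, int channel)
{
    return (float)(int)q * scale[channel];
}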
diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution.cl b/src/core/CL/cl_kernels/nchw/direct_convolution.cl
new file mode 100644
index 0000000000..866f62da95
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+/** This kernel performs a direct convolution over the three lowest dimensions of the input tensor.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are added to the convolution, -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
+ * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
+ * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ const int id0 = get_global_id(0);
+ const int id1 = get_global_id(1);
+ const int id2 = get_global_id(2);
+
+ const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
+ const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
+
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
+ __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
+
+#ifdef IS_QUANTIZED
+ int acc_value = 0;
+#else /* IS_QUANTIZED */
+ DATA_TYPE acc_value = 0;
+#endif /* IS_QUANTIZED */
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ for(int y = 0; y < WEI_HEIGHT; ++y)
+ {
+ for(int x = 0; x < WEI_WIDTH; ++x)
+ {
+ const int idx_x = (x_coords + x);
+ const int idx_y = (y_coords + y);
+ if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
+ {
+ const int weight_offset = x + (WEI_HEIGHT * y);
+ const int input_offset = idx_x + SRC_WIDTH * idx_y;
+#ifdef IS_QUANTIZED
+ int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
+ int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
+ acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
+#else /* IS_QUANTIZED */
+ DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset);
+ DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset);
+ acc_value += input * weight;
+#endif /* IS_QUANTIZED */
+ }
+ }
+ }
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#ifdef IS_QUANTIZED
+ int bias = *((__global int *)(vector_offset(&biases, id2)));
+#else /* IS_QUANTIZED */
+ DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2)));
+#endif /* IS_QUANTIZED */
+ acc_value += bias;
+
+#endif /* defined(HAS_BIAS) */
+
+#ifdef IS_QUANTIZED
+
+#if OUTPUT_SHIFT < 0
+ acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
+#else // OUTPUT_SHIFT < 0
+ acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
+#endif // OUTPUT_SHIFT < 0
+ acc_value = acc_value + OUTPUT_OFFSET;
+#endif /* IS_QUANTIZED */
+
+ *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE);
+}
\ No newline at end of file
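
In the quantized path above, the kernel accumulates (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET) in 32-bit integers and then requantizes with the fixed-point pair (OUTPUT_MULTIPLIER, OUTPUT_SHIFT). A real-valued sketch in plain C (illustrative only; effective_scale stands for the value the fixed-point pair encodes, and the function name is made up, not the ASYMM_* routine the kernel actually calls):

#include <math.h>
#include <stdint.h>

/* acc = sum over the receptive field of (in + INPUT_OFFSET) * (w + WEIGHTS_OFFSET) */
static int8_t requantize(int32_t acc, double effective_scale, int32_t output_offset)
{
    long v = lround((double)acc * effective_scale) + output_offset; /* scale, then re-center */
    if (v < -128) v = -128; /* CONVERT_SAT equivalent for QASYMM8_SIGNED */
    if (v > 127)  v = 127;
    return (int8_t)v;
}
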
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl
index a1467a0b36..fddf918c63 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/nchw/im2col.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "helpers.h"
-
#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
#if ELEMENT_SIZE == 1
@@ -861,500 +860,4 @@ __kernel void im2col_generic_padx0_pady0_nchw(
#endif // HAS_BIAS
}
#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-
-#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
-
-/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
- * @name IM2COL1X9_NHWC_STORE
- *
- * @note To use this macro for a 3x3 block, @p ROW has to be 0
- *
- * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
- * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size
- * @param[in] DATA_TYPE Data type of @p DATA
- * @param[in] SRC_DEPTH Input channel size / depth
- * @param[in] DATA Value variable base name
- * @param[in] ROW The row number to store. Supported: 0-8
- * @param[in] OUTPUT_PTR Output pointer
- * @{
- */
-#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- const bool at_channel_boundary = get_global_id(0) == 0; \
- if(at_channel_boundary) \
- { \
- IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- } \
- else \
- { \
- IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- }
-#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
-#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-
-#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- VSTORE(VECTOR_SIZE) \
- (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-
-#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-/** @}*/
-
-/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch size
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int yi_coord = 0;
- int3 offset = 0;
-
- // Clamp xi
- int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
- xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
- // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
- xi_offset *= (int3)src_stride_y;
-
- // Out-of-bound condition for X
- int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
-
- // yi == 0
- // Clamp yi
- // yi_coord is casted to unsigned int in order to use just a min() operation
- // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
- // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
- // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
- // also causes y_cond (y padding condition) to be satisfied
- yi_coord = yi - (int)PAD_TOP;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // yi == 1
- // Clamp yi_coord (it can be negative if PAD_TOP > 1)
- yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with zeros
- y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
- values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // yi == 2
- // Clamp yi_coord
- yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
- values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // Store in a boundary-aware way to avoid padding
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i) \
- ({ \
- yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
- \
- offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
- offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
- \
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
- \
- int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
- values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
- values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
- values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
- values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
- values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
- values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
- values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
- values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
- \
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
- })
-#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i) \
- ({ \
- yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
- \
- offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
- offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
- \
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
- \
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
- })
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col9x9_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch size
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int yi_coord = 0;
- int8 offset0 = 0;
- int offset1 = 0;
-
- // Clamp xi
- int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
- int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
-
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
- xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
- xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
- xi_offset0 *= (int8)src_stride_y;
- xi_offset1 *= (int)src_stride_y;
-
- // Out-of-bound condition for X
- int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
- int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
- IM2COL1x9(0);
- IM2COL1x9(1);
- IM2COL1x9(2);
- IM2COL1x9(3);
- IM2COL1x9(4);
- IM2COL1x9(5);
- IM2COL1x9(6);
- IM2COL1x9(7);
- IM2COL1x9(8);
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
-/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch size
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int i = 0;
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- // Clamp yi_coord
- int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
- yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
-
- // Out-of-bound condition for Y
- int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- // Clamp xi_coord
- int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
- xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
-
- // Out-of-bound condition for X
- int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
- int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
-
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Replace with PAD_VALUE if the value is out-of-bound
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- // Store in a boundary-aware way to avoid padding
-#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
- const bool at_channel_boundary = get_global_id(0) == 0;
- if(at_channel_boundary)
- {
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
- (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
- }
- else // at_channel_boundary
-#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
- {
- VSTORE(VECTOR_SIZE)
- (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
- }
- i++;
- }
- }
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file
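
Two indexing tricks recur in the NHWC im2col kernels removed above. First, the boundary-aware channel shift: work-item 0 stores only the BOUNDARY_VECTOR_SIZE leftover elements, and every later work-item is shifted back so its full VECTOR_SIZE vector still lands in bounds. Second, the single-min clamp: casting a possibly negative row index to unsigned turns the lower-bound check into the same min() as the upper-bound one. A scalar sketch in C (illustrative; gid stands for get_global_id(0) and the function names are made up):

/* ch = max((int)(gid * VECTOR_SIZE) - shift_amount, 0), as in the kernels above */
static int channel_base(int gid, int vector_size, int boundary_vector_size)
{
    const int shift_amount = vector_size - boundary_vector_size;
    const int ch = gid * vector_size - shift_amount;
    return ch > 0 ? ch : 0; /* work-item 0 starts at channel 0 and stores only the leftover */
}

/* yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)): a negative index wraps to a
 * huge unsigned value, so one min() clamps both ends; the padding condition then
 * overwrites the bogus load with PAD_VALUE. */
static unsigned clamp_row(int yi_coord, unsigned src_height)
{
    const unsigned u = (unsigned)yi_coord;
    return u < src_height - 1u ? u : src_height - 1u;
}
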
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
index ff4dc8ec38..deada49db5 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "helpers.h"
+#include "tile_helpers.h"
#define MUL_OP(x, y) ((x) * (y))
#define ADD_OP(x, y) ((x) + (y))
@@ -29,9 +30,6 @@
#define POW_OP(x, y) pow((x), (y))
#define SQCVT_SAT(a) (a)
-#define LOAD_OP(offset, ptr) vload4(offset, ptr)
-#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
-
#if defined(NUM_SLICES)
/** Apply cross-map normalization.
*
@@ -58,8 +56,8 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
{
Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -80,7 +78,7 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
for(int i = left_slice; i <= right_slice; i++)
{
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
acc = ADD_OP(acc, MUL_OP(values, values));
}
@@ -88,9 +86,10 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
normalized = POW_OP(acc, beta_v);
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+ normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
- STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
#endif /* defined(NUM_SLICES) */
@@ -101,6 +100,7 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
* @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
* @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, where VEC_SIZE_LEFTOVER = x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
*
* @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
@@ -126,17 +126,16 @@ __kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ acc = 0;
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ coeff_v = SQCVT_SAT(COEFF);
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ beta_v = SQCVT_SAT(BETA);
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+ kappa_v = SQCVT_SAT(KAPPA);
- const int current_col = get_global_id(0) << 2;
- const int left_pos = max(-(int)RADIUS, -3 - current_col);
- const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
+ const int left_pos = -(int)RADIUS;
+ const int right_pos = (int)RADIUS;
#if defined(IN_MAP_2D)
const int current_row = get_global_id(1);
@@ -152,90 +151,10 @@ __kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
{
#if defined(IN_MAP_2D)
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
-#else /* defined(IN_MAP_2D) */
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
-#endif /* defined(IN_MAP_2D) */
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-#if defined(IN_MAP_2D)
- }
-#endif /* defined(IN_MAP_2D) */
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
-
- STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif // defined(WIDTH_SIZE)
-
-#if defined(NUM_SLICES)
-/** Apply in-map normalization when tensors are in the NHWC data layout format.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
-
- const int current_cols = get_global_id(1);
- const int first_col = max(-(int)RADIUS, -current_cols);
- const int last_col = min((int)RADIUS, (int)get_global_size(1) - 1 - current_cols);
-
-#if defined(IN_MAP_2D)
- const int current_rows = get_global_id(2);
- const int first_row = max(-(int)RADIUS, -current_rows);
- const int last_row = min((int)RADIUS, (int)NUM_SLICES - 1 - current_rows);
-#endif /* defined(IN_MAP_2D) */
-
-#if defined(IN_MAP_2D)
- for(int j = first_row; j <= last_row; ++j)
- {
-#endif /* defined(IN_MAP_2D) */
- for(int i = first_col; i <= last_col; ++i)
- {
-#if defined(IN_MAP_2D)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, j));
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
#else /* defined(IN_MAP_2D) */
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, 0));
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
#endif /* defined(IN_MAP_2D) */
acc = ADD_OP(acc, MUL_OP(values, values));
}
@@ -247,8 +166,9 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
normalized = POW_OP(acc, beta_v);
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+ normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
- STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
-#endif /* defined(NUM_SLICES) */
+#endif // defined(WIDTH_SIZE)
\ No newline at end of file
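
For reference, both the cross-map and the in-map normalization kernels in this file implement the same local response normalization formula; a scalar sketch in C (illustrative; coeff = alpha / norm_size, as the kernel docs note):

#include <math.h>

/* out = x / (kappa + coeff * sum_{i in window} x_i^2) ^ beta */
static float lrn_one(const float *window, int len, float x,
                     float coeff, float beta, float kappa)
{
    float acc = 0.0f;
    for (int i = 0; i < len; ++i)
        acc += window[i] * window[i];           /* ADD_OP(acc, MUL_OP(values, values)) */
    return x / powf(kappa + coeff * acc, beta); /* DIV_OP(x, POW_OP(acc, beta)) */
}
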
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
index f803f5288e..23a0de76f7 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,56 +79,4 @@ __kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
VSTORE(VEC_SIZE)
(res, 0, (__global DATA_TYPE *)dst.ptr);
}
-
-/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
- */
-__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector std = CONVERT_TO_VECTOR_STRUCT(std);
-
- const uint current_slice = get_global_id(0);
-
- const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
- const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
-
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
- TYPE res = (data - curr_mean) / curr_std;
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file
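
The NCHW kernel kept above and the removed NHWC variant apply the same per-channel transform, differing only in how the channel is indexed. In scalar form (a sketch, with mean/std already looked up for the channel of x):

/* out = (x - mean[c]) / std[c] */
static float normalize_planar_yuv_one(float x, float channel_mean, float channel_std)
{
    return (x - channel_mean) / channel_std;
}
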
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
index 27017a08ca..0f02ef6184 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,17 +76,21 @@ __kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
const uint current_slice = get_global_id(2) % NUM_CHANNELS;
- float16 curr_mean_flt = (float16)(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
- curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
- float16 curr_std_flt = (float16)(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
- curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
- float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
- data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
+ data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
// Perform normalization
- float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
VSTORE(VEC_SIZE)
@@ -94,65 +98,4 @@ __kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
}
#endif // defined(NUM_CHANNELS)
-
-/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
- * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
- */
-__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector std = CONVERT_TO_VECTOR_STRUCT(std);
-
- const uint current_slice = get_global_id(0);
-
- float16 curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
- curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
-
- float16 curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
- curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
-
- float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
- data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT);
-
- // Perform normalization
- float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
-
- const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
- VSTORE(VEC_SIZE)
- (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file
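// A minimal scalar sketch of the quantized path above: dequantize with the
// (OFFSET, SCALE) pair, normalize against the already-dequantized mean and
// std, then requantize. Names are illustrative; the kernel additionally
// applies CONVERT_SAT to the returned value.
float normalize_planar_yuv_q8_ref(uchar q, float mean_flt, float std_flt,
                                  float offset, float scale)
{
    const float data_flt = round((float)q - offset) * scale; // dequantize
    const float res_flt  = (data_flt - mean_flt) / std_flt;  // normalize
    return round(res_flt / scale) + offset;                  // requantize
}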
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
new file mode 100644
index 0000000000..15ad116289
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#if defined(QUANTIZED)
+#define POOL_OP(x, y) (max((x), (y)))
+#else // defined(QUANTIZED)
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif // defined(QUANTIZED)
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(FP_MIXED_PRECISION) || defined(QUANTIZED)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) || defined(QUANTIZED)*/
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) || defined(QUANTIZED)*/
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
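+
+// Worked example for calculate_avg_scale (hypothetical numbers): with
+// pool_size = 3, stride = 2, pad = 1 and upper_bound = 5, the work-item at
+// global id 0 gets start = 0 * 2 - 1 = -1 and end = min(-1 + 3, 5) = 2.
+// Without EXCLUDE_PADDING the scale is (2 - (-1))^2 = 9, i.e. the padding
+// counts towards the divisor; with -DEXCLUDE_PADDING start is clamped to 0
+// and the scale is (2 - 0)^2 = 4, so only in-bounds elements are averaged.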
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Performs a pooling function of pool size equal to N (NCHW)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32/QASYMM8;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32/QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int id0 = get_global_id(0);
+ int id1 = get_global_id(1);
+ int id2 = get_global_id(2);
+
+ int x_coords = (id0 * STRIDE_X) - PAD_X;
+ int y_coords = (id1 * STRIDE_Y) - PAD_Y;
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + y_coords * (int)src_stride_y + id2 * src_stride_z;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ vdata = INITIAL_VALUE;
+ ACC_DATA_TYPE sdata = INITIAL_VALUE;
+
+ const int end_x = min((int)POOL_SIZE_X, (int)(SRC_WIDTH - x_coords));
+ const int end_y = min((int)POOL_SIZE_Y, (int)(SRC_HEIGHT - y_coords));
+
+ // Load data
+ for(int y = 0; y < end_y; ++y)
+ {
+ if((y_coords + y) >= 0)
+ {
+ int x = 0;
+ for(; x <= (end_x - 8); x += 8)
+ {
+ int8 src_x = (int8)(x_coords + x) + VEC_OFFS(int, 8);
+#if defined(POOL_AVG) || defined(POOL_L2)
+ SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ cond_x = CONVERT(src_x < 0, SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8));
+ src_x = clamp(src_x, (int8)0, (int8)(SRC_WIDTH - 1));
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = select(VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y)), (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))0, REVERSE(cond_x, 8));
+#else // defined(POOL_AVG) || defined(POOL_L2)
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y));
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < end_x; ++x)
+ {
+ int src_x = x_coords + x;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ SELECT_DATA_TYPE(ACC_DATA_TYPE)
+ cond_x = (src_x < 0);
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ ACC_DATA_TYPE data0 = select((ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y))), (ACC_DATA_TYPE)0, cond_x);
+#else // defined(POOL_AVG) || defined(POOL_L2)
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y)));
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
+ }
+
+ // Reduce result
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average pooling
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(QUANTIZED)
+
+ DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+ const float result_f32 = convert_float(result_q8);
+ const float input_offset = (float)OFFSET_IN1;
+ const float input_scale = (float)SCALE_IN1;
+ const float scale_out = (float)SCALE_OUT;
+ const float offset_out = (float)OFFSET_OUT;
+ const float in_f32 = (result_f32 - input_offset) * input_scale;
+ const float out_f32 = in_f32 / scale_out + offset_out;
+ result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
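+
+    // Worked example of the requantization above (hypothetical parameters):
+    // OFFSET_IN1 = 5, SCALE_IN1 = 0.5f, OFFSET_OUT = 3, SCALE_OUT = 0.25f and
+    // result_q8 = 9 give in_f32 = (9 - 5) * 0.5 = 2.0 and
+    // out_f32 = 2.0 / 0.25 + 3 = 11, i.e. the pooled value is re-expressed
+    // in the output tensor's quantization scale and offset.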
+
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = result_q8;
+
+#else // defined(QUANTIZED)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = (DATA_TYPE)res;
+#endif // defined(QUANTIZED)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
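+
+// Illustrative only: the tail of pooling_layer_MxN_nchw reduces its 8-wide
+// accumulator with a three-step tree reduction. A standalone sketch of the
+// same pattern for float8 max pooling (this helper is not used by the
+// kernels in this file):
+inline float reduce_max8(float8 v)
+{
+    float4 r4 = fmax(v.s0123, v.s4567); // 8 -> 4
+    float2 r2 = fmax(r4.s01, r4.s23);   // 4 -> 2
+    return fmax(r2.s0, r2.s1);          // 2 -> 1
+}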
+
+/** Performs a MAX pooling of pool size equal to 2 and records the max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16/F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(indices))
+{
+ int id0 = get_global_id(0);
+ int id1 = get_global_id(1);
+ int id2 = get_global_id(2);
+
+ int2 x_coords = clamp((int2)((id0 * STRIDE_X) - PAD_X), (int2)0, (int2)(SRC_WIDTH - 1));
+ int2 y_coords = clamp((int2)((id1 * STRIDE_Y) - PAD_Y) + VEC_OFFS(int, 2), (int2)0, (int2)(SRC_HEIGHT - 1));
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + id2 * src_stride_z;
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s0 * sizeof(DATA_TYPE) + y_coords.s0 * (int)src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s1 * sizeof(DATA_TYPE) + y_coords.s1 * (int)src_stride_y));
+
+ // Perform calculations
+ DATA_TYPE data0_max = POOL_OP(data0.s0, data0.s1);
+ DATA_TYPE data1_max = POOL_OP(data1.s0, data1.s1);
+ DATA_TYPE res = POOL_OP(data0_max, data1_max);
+ // Store result
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = res;
+
+#if defined(SRC_BATCH)
+
+ uint offset_top = (x_coords.s0 + y_coords.s0 * SRC_WIDTH + id2 * (SRC_WIDTH * SRC_HEIGHT)) % SRC_BATCH;
+ uint offset_bottom = offset_top + SRC_WIDTH;
+
+ uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+ uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+ uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+ *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 * sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index;
+
+#endif // defined(SRC_BATCH)
+}
\ No newline at end of file
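// Scalar sketch of the argmax-index selection in pooling_layer_2_nchw_indices
// (illustrative names; the kernel does the same with select()):
uint argmax_index_2x2(float top0, float top1, float bot0, float bot1,
                      uint offset_top, uint offset_bottom)
{
    const uint index_top = (top0 >= top1) ? offset_top : offset_top + 1u;
    const uint index_bot = (bot0 >= bot1) ? offset_bottom : offset_bottom + 1u;
    return (fmax(top0, top1) >= fmax(bot0, bot1)) ? index_top : index_bot;
}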
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
index de10decdec..7524ba7b4a 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
index 29344de37a..f66b17c1a6 100644
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,45 +72,4 @@ __kernel void reorg_layer_nchw(
int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
}
-
-/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int xo = get_global_id(1);
- int yo = get_global_id(2);
- int zo = get_global_id(0);
- int xi, yi, zi;
-
- CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
- int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
new file mode 100644
index 0000000000..2b4d6be9fb
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
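+
+// Worked example for the transforms above (hypothetical 2x downscale,
+// SCALE_X = SCALE_Y = 2.0f): with SAMPLING_POLICY_CENTER the bilinear path
+// maps output pixel (0, 0) to (0 + 0.5) * 2.0 - 0.5 = 0.5 in both
+// dimensions, i.e. exactly between its four input neighbours, while
+// SAMPLING_POLICY_TOP_LEFT maps it to (0.0, 0.0), the top-left input pixel.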
+
+/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scale_nearest_neighbour_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ float8 transformed = transform_nearest((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
+#ifdef ALIGN_CORNERS
+ transformed = round(transformed);
+#endif // ALIGN_CORNERS
+
+ TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 4, cond);
+ cond[0].v = CONVERT(((transformed.even < 0) || (transformed.even >= (int)SRC_WIDTH)) || ((transformed.odd < 0) || (transformed.odd >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ TILE(int, 1, 4, in_x);
+ TILE(int, 1, 4, in_y);
+ in_x[0].v = convert_int4(clamp(transformed.even, 0.f, SRC_WIDTH - 1.f));
+ in_y[0].v = convert_int4(clamp(transformed.odd, 0.f, SRC_HEIGHT - 1.f));
+
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+ })
+
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+ if(x == get_global_size(0) - 1)
+ {
+#if VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+ else
+ {
+#if VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scale_bilinear_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ TILE(float, 1, 8, trans_coords);
+ TILE(float, 1, 8, floor_coords);
+ TILE(int, 1, 16, in_x);
+ TILE(int, 1, 16, in_y);
+
+ trans_coords[0].v = transform_bilinear((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
+ floor_coords[0].v = floor(trans_coords[0].v);
+
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 4,
+ {
+ in_x[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 0] + (j % 2);
+ in_y[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 1] + (j > 1);
+ })
+ })
+
+#if defined(BORDER_MODE_CONSTANT)
+ TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 16, cond);
+ cond[0].v = CONVERT(((in_x[0].v < 0) || (in_x[0].v >= (int)SRC_WIDTH)) || ((in_y[0].v < 0) || (in_y[0].v >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif // defined(BORDER_MODE_CONSTANT)
+
+ in_x[0].v = clamp(in_x[0].v, 0, (int16)((int)SRC_WIDTH - 1));
+ in_y[0].v = clamp(in_y[0].v, 0, (int16)((int)SRC_HEIGHT - 1));
+
+ TILE(DATA_TYPE, 1, 16, in_vals);
+
+ // Loads the values from the input image
+#if defined(BORDER_MODE_CONSTANT)
+ LOOP_UNROLLING(int, i, 0, 1, 16,
+ {
+ in_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+ })
+#else // defined(BORDER_MODE_CONSTANT)
+ LOOP_UNROLLING(int, i, 0, 1, 16,
+ {
+ in_vals[0].s[i] = *((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y));
+ })
+#endif // defined(BORDER_MODE_CONSTANT)
+
+ TILE(float, 1, 8, a);
+ TILE(float, 1, 8, b);
+
+ a[0].v = trans_coords[0].v - floor_coords[0].v;
+ b[0].v = ((float8)(1.f)) - a[0].v;
+
+#if defined(OFFSET) && defined(SCALE)
+ TILE(float, 1, 16, in_vals_f32);
+ TILE(float, 1, 4, out_vals_f32);
+
+ in_vals_f32[0].v = convert_float16(convert_int16(in_vals[0].v) - (int16)OFFSET) * (float16)SCALE;
+
+ // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
+ // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
+ // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+ // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ out_vals_f32[0].s[i] = (in_vals_f32[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+ })
+
+ TILE(DATA_TYPE, 1, 4, out_vals_4);
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+ out_vals_4[0].v = CONVERT_SAT(convert_int4_sat_rtp(out_vals_f32[0].v / (float)SCALE) + OFFSET, VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = out_vals_4[0].s[i];
+ })
+#else // defined(OFFSET) && defined(SCALE)
+
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+ // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
+ // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
+ // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+ // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = (in_vals[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+ })
+#endif // defined(OFFSET) && defined(SCALE)
+
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+ if(x == get_global_size(0) - 1)
+ {
+#if VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+ else
+ {
+#if VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+}
\ No newline at end of file
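// Scalar sketch of one bilinear output value as computed above: (ax, ay) is
// the fractional part of the transformed coordinate (a[0] in the kernel,
// with b[0] = 1 - a[0]) and in00..in11 are the four neighbouring input
// pixels (illustrative names):
float bilinear_ref(float in00, float in10, float in01, float in11,
                   float ax, float ay)
{
    const float bx = 1.0f - ax;
    const float by = 1.0f - ay;
    return in00 * bx * by + in10 * ax * by + in01 * bx * ay + in11 * ax * ay;
}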
diff --git a/src/core/CL/cl_kernels/nchw/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
new file mode 100644
index 0000000000..91520213e8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(0);
+ const int out_y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+ }
+}
+
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(0);
+ const int out_y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
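// Sketch of the output-to-input coordinate mapping shared by both kernels
// above (illustrative names; batch_in stands for the BATCH_IN compile-time
// constant, the input batch size):
void space_to_batch_coords(int out_x, int out_y, int batch_id,
                           int block_x, int block_y, int batch_in,
                           int *pos_x, int *pos_y, int *in_batch)
{
    const int block_id = batch_id / batch_in; // which (x, y) offset inside the block
    *pos_x    = out_x * block_x + (block_id % block_x);
    *pos_y    = out_y * block_y + (block_id / block_x);
    *in_batch = batch_id % batch_in;          // originating input batch
}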
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..8097f65942
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
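// Sketch of the mapping used by space_to_depth_nchw (illustrative names).
// Worked example (hypothetical shapes): CHANNEL_SIZE = 8 output channels and
// BLOCK_SHAPE = 2 give r = 8 / 4 = 2, so for get_global_id(2) = 5:
// in_z = 5 % 2 = 1, block offset = 5 / 2 = 2, in_x = 2x and in_y = 2y + 1.
void space_to_depth_coords(int x, int y, int z_out, int block, int r,
                           int *in_x, int *in_y, int *in_z)
{
    *in_z = z_out % r;        // input channel
    const int b = z_out / r;  // which of the block*block spatial offsets
    *in_x = x * block + (b % block);
    *in_y = y * block + (b / block);
}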
diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
new file mode 100644
index 0000000000..723c491165
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+ vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+ vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
\ No newline at end of file
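// Scalar reference of the 2x nearest-neighbour upsample performed above
// (a sketch; dst is assumed to be exactly twice src in width and height):
void upsample2x2_ref(const float *src, float *dst, int src_w, int src_h)
{
    const int dst_w = 2 * src_w;
    for(int y = 0; y < src_h; ++y)
    {
        for(int x = 0; x < src_w; ++x)
        {
            const float v = src[y * src_w + x];         // each input element
            dst[(2 * y + 0) * dst_w + (2 * x + 0)] = v; // is written to a
            dst[(2 * y + 0) * dst_w + (2 * x + 1)] = v; // 2x2 output block
            dst[(2 * y + 1) * dst_w + (2 * x + 0)] = v;
            dst[(2 * y + 1) * dst_w + (2 * x + 1)] = v;
        }
    }
}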
diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
new file mode 100644
index 0000000000..85eff9e6d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SRC_DIM_Z)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out1 = 0.0f;
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out2 = 0.0f;
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out3 = 0.0f;
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 16 channels for 3x3 kernels
+ // 4 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
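+ // Compute the transformed filter out = G * w * G^T, where w is the 3x3 filter and
+ //
+ //     |  1/4    0     0   |
+ //     | -1/6  -1/6  -1/6  |
+ // G = | -1/6   1/6  -1/6  |
+ //     |  1/24  1/12  1/6  |
+ //     |  1/24 -1/12  1/6  |
+ //     |  0     0     1    |
+ //
+ // For the 3x1/1x3 cases only the 1D transform (a single 6-element row) is computed.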
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0) / 16.f;
+ out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+ out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+ out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s5 = (w0.s2) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+ out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (w2.s0) / 4.f;
+ out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+ out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+ out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 36 channels for 3x3 kernels
+ // 6 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1/1x5 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Transform the input tile
+
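+ // Compute the transformed filter out = G * w * G^T, where w is the 5x5 filter and G is
+ // the 8x5 matrix with rows:
+ //   [  1,  0,  0,  0,  0 ]
+ //   [  1,  1,  1,  1,  1 ] * -2/9
+ //   [  1, -1,  1, -1,  1 ] * -2/9
+ //   [  1,  2,  4,  8, 16 ] / 90
+ //   [  1, -2,  4, -8, 16 ] / 90
+ //   [ 16,  8,  4,  2,  1 ] / 180
+ //   [ 16, -8,  4, -2,  1 ] / 180
+ //   [  0,  0,  0,  0,  1 ]
+ // For the 5x1/1x5 cases only the 1D transform (a single 8-element row) is computed.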
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00.s0;
+ out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+ out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+ out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+ out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+ out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+ out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+ out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+ out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+ out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40.s0;
+ out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+ out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+ out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+ out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+ out7.s7 = w41;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
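+ // Note: the filter index is multiplied by sizeof(DATA_TYPE), which assumes the destination X stride is exactly one element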
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+#endif // defined(SRC_DIM_Z)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
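+ // The 3x1 transform is the horizontal 1D case of the 3x3 transform: with
+ // -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL defined, the generic kernel computes only the
+ // first transformed row, so all tensor arguments can simply be forwarded to it.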
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
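+ // The 1x3 transform is the vertical 1D case of the 3x3 transform: with
+ // -DWINOGRAD_FILTER_TRANSFORM_VERTICAL defined, the generic kernel loads a single column
+ // and computes only the first transformed row, so all tensor arguments can simply be forwarded to it.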
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor into batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
new file mode 100644
index 0000000000..8c382183c3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
@@ -0,0 +1,1346 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
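+// Computes one 1D pass of the 8-point Winograd input transform (out = B^T * tmp) used for
+// the 4x4 output tile / 5x5 kernel case; comm_fact holds sub-expressions shared between
+// the output elements. The first row of B^T is [1, 0, -21/4, 0, 21/4, 0, -1, 0].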
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = 2.5f * tmp.s3; \
+ comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
+ comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+ })
+
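+// Computes one 1D pass of the 8-point Winograd input transform for the 2x2 output tile /
+// 7x7 kernel case. The first row of its B^T matrix is [-36, 0, 49, 0, -14, 0, 1, 0].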
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
+ comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
+ comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
+ out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 - 14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+ })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
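+ // src_addr now points at the top-left of the padded input patch; reads that
+ // fall outside the source image are assumed to be covered by the padding the
+ // runtime allocates around the tensor.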
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ tmp0 -= in_row2;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+ DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+ DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+ DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
+
+ DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+ DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+ DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+ DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
+
+ DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+ DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+ DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+ DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
+
+ DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+ DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+ DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+ DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
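+
+// Note on the destination layout produced above: for each input tile, the
+// transformed values are written one per Z-plane of dst, at x-offset z (the
+// input channel) and y-offset x + y * NUM_TILES_X (the tile index).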
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+ const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
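+ // Advance to the second input plane handled by this work-item.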
+ src_addr += src_stride_z;
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp4 = in_row4;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ tmp0 -= in_row2;
+ tmp4 -= in_row6;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp5 = in_row5 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp6 = in_row6 - in_row5;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp7 = in_row5 - in_row7;
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+ vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+ vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+ vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+ vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+ vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+ vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+ vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+ vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+ vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+ vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+ vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+ vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+ vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+ vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size is 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
+#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE out0 = 0.0f;
+ DATA_TYPE out1 = 0.0f;
+ DATA_TYPE out2 = 0.0f;
+ DATA_TYPE out3 = 0.0f;
+ DATA_TYPE out4 = 0.0f;
+ DATA_TYPE out5 = 0.0f;
+
+ // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
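+ // e.g. 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0 = 4 * (4*d0 - 5*d2 + d4):
+ // row 0 of the 6-point B^T, [4, 0, -5, 0, 1, 0], applied along x and scaled by
+ // the row-0 coefficient (4) that the same matrix contributes along y.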
+ out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
+ out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
+ out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
+ out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
+ out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
+ out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ // Row4
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+
+ // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
+ DATA_TYPE k0 = d41.s0;
+ DATA_TYPE k1 = d41.s0;
+ DATA_TYPE k2 = d41.s0;
+ DATA_TYPE k3 = d41.s0;
+ DATA_TYPE k4 = d41.s0;
+ DATA_TYPE k5 = 0.0f;
+
+ k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+ k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+ k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+ k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+ k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+ k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+ out0 += k0;
+ out1 += k1;
+ out2 += k2;
+ out3 += k3;
+ out4 += k4;
+ out5 += k5;
+
+ // Row2
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+
+ out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
+ out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
+ out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
+ out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
+ out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
+ out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Compute destination address
+#if defined(SRC_DEPTH)
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
+
+ uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
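+ // (dst_stride_z is assumed to be a multiple of sizeof(DATA_TYPE), so the
+ // division above converts the Z stride from bytes to elements exactly.)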
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ DATA_TYPE out6 = k0;
+ DATA_TYPE out7 = k1;
+ DATA_TYPE out8 = k2;
+ DATA_TYPE out9 = k3;
+ DATA_TYPE out10 = k4;
+ DATA_TYPE out11 = k5;
+ DATA_TYPE out12 = k0;
+ DATA_TYPE out13 = k1;
+ DATA_TYPE out14 = k2;
+ DATA_TYPE out15 = k3;
+ DATA_TYPE out16 = k4;
+ DATA_TYPE out17 = k5;
+ DATA_TYPE out18 = k0;
+ DATA_TYPE out19 = k1;
+ DATA_TYPE out20 = k2;
+ DATA_TYPE out21 = k3;
+ DATA_TYPE out22 = k4;
+ DATA_TYPE out23 = k5;
+ DATA_TYPE out24 = k0;
+ DATA_TYPE out25 = k1;
+ DATA_TYPE out26 = k2;
+ DATA_TYPE out27 = k3;
+ DATA_TYPE out28 = k4;
+ DATA_TYPE out29 = k5;
+
+ // Row1
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+
+ // Row3
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+
+ // Compute common parts for the channels between [6, 29]
+ // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
+ // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+ DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+ DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
+ DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+ DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+ DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+ DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
+ DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+ DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+ // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+ // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+ DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+ DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+ DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+ DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+ DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
+ DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+ DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
+ DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+ DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+ out6 += part0 - part1;
+ out12 += part0 + part1;
+ out7 += part2 + part3 + part4 + part5;
+ out8 += part2 - part3 + part4 - part5;
+ out13 += part2 + part3 - part4 - part5;
+ out14 += part2 - part3 - part4 + part5;
+ out9 += part6 + part7 + part8 + part9;
+ out10 += part6 - part7 + part8 - part9;
+ out15 += part6 - part7 - part8 + part9;
+ out16 += part6 + part7 - part8 - part9;
+ out11 += part10 + part11;
+ out17 += part10 - part11;
+
+ out18 += part13 - part12;
+ out24 += part13 + part12;
+ out19 += part14 + part15 + part16 + part17;
+ out20 += part14 - part15 + part16 - part17;
+ out25 += part14 - part15 - part16 + part17;
+ out26 += part14 + part15 - part16 - part17;
+ out21 += part18 + part19 + part20 + part21;
+ out22 += part18 - part19 + part20 - part21;
+ out27 += part18 - part19 - part20 + part21;
+ out28 += part18 + part19 - part20 - part21;
+ out23 += part22 + part23;
+ out29 += part22 - part23;
+
+ *(dst_addr) = out6;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out7;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out8;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out9;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out10;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out11;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out12;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out13;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out14;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out15;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out16;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out17;
+ dst_addr += dst_plane_stride;
+
+ *(dst_addr) = out18;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out19;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out20;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out21;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out22;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out23;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out24;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out25;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out26;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out27;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out28;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out29;
+ dst_addr += dst_plane_stride;
+
+ // Row5
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+
+ // Channels [30, 35]
+ out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+ // Load input tile
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
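+
+ // The tile is transformed as U = B^T * d * B: tmp0..tmp7 below hold the
+ // column pass (B^T * d), and OUTPUT_ROW_4x4_5x5 then applies the row pass to
+ // each intermediate row. The 1D variants perform only one of the two passes.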
+
+ // Calculate common factors for intermediate tensor
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = 0.0f;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
+ tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+ comm_fact0 = (DATA_TYPE)2.5f * in_row3;
+ comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+ comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
+ comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Calculate output rows (reuse comm_fact0 vector)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0;
+
+ OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1, out2, out3, out4, out5, out6, out7;
+
+ OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Store values across the channels
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
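+// The 1D horizontal kernels below are thin wrappers that forward to the 2D
+// kernels above, which reduce to a single-row transform when
+// WINOGRAD_INPUT_TRANSFORM_HORIZONTAL is defined.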
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1, the output tile is 4x1 and the data layout is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is a multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
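+
+// Illustrative build options (an assumption for documentation purposes, not
+// taken from the library's host code): compiling the 1x4/1x3 vertical input
+// transform above for 5 tiles per row with the padding used in the doc
+// examples could look like:
+//   -DWINOGRAD_INPUT_TRANSFORM_VERTICAL -DNUM_TILES_X=5 -DPAD_LEFT=1
+//   -DPAD_TOP=0 -DOUTPUT_TILE_W=1 -DOUTPUT_TILE_H=4 -DDATA_TYPE=half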
diff --git a/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
new file mode 100644
index 0000000000..861ed50651
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
@@ -0,0 +1,1082 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 2x2/2x1 or 1x2 tile, according to the filter size
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the 16 or 4 channels to compose the 4x4, 4x1 or 1x4 tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute the 2x1 or 1x2 output tile
+ // out00 = d00 + d01 + d02
+ // out01 = d01 - d02 - d03
+
+ float out00 = d00 + d01 + d02;
+ float out01 = d01 - d02 - d03;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ // Compute the 2x2 output tile
+ float k0 = d01 + d11 + d21;
+ float k1 = d02 + d12 + d22;
+ float k2 = d11 - d21 - d31;
+ float k3 = d12 - d22 - d32;
+
+ // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+ // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+ // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+ // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
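+ //
+ // Equivalently (assuming the usual Winograd F(2x2, 3x3) matrices), this is
+ // the 2-D transform Y = A^T * d * A with
+ //        | 1  1  1  0 |
+ // A^T =  | 0  1 -1 -1 |
+ // k0..k3 below factor out the column terms shared between the two rows.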
+
+ float out00 = d10;
+ float out01 = -d13;
+ float out10 = d10;
+ float out11 = -d13;
+
+ out00 += d00 + d20 + k0 + k1;
+ out01 += k0 - k1 - (d03 + d23);
+ out10 += -d20 - d30 + k2 + k3;
+ out11 += k2 - k3 + d23 + d33;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
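+
+ // Mapping example (illustrative values): with NUM_TILES_X=16,
+ // OUTPUT_TILE_W=2 and OUTPUT_TILE_H=2, the work-item with y_in = 18 writes
+ // the tile whose top-left corner is
+ // (x_out, y_out) = ((18 % 16) * 2, (18 / 16) * 2) = (4, 2).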
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
+ (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+ // Add bias
+ out10 += (float)b;
+ out11 += (float)b;
+#endif // defined(HAS_BIAS)
+ vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
+ (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the 6x6, 6x1 or 1x6 tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00, out01, out02 and out03
+ float out00 = d00 + d01 + d02 + d03 + d04;
+ float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
+ float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
+ float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
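+
+ // The four expressions above are the rows of the 1-D F(4x4, 3x3) output
+ // transform out = A^T * d (assuming the usual Winograd matrices):
+ //        | 1  1  1  1  1  0 |
+ //        | 0  1 -1  2 -2  0 |
+ // A^T =  | 0  1  1  4  4  0 |
+ //        | 0  1 -1  8 -8  1 |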
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+
+ // Compute out00, out01, out02 and out03
+ float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+
+ float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+ float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+ out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+ out01 += k1 - d02 - d12 - d22 - d32 - d42;
+ out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+ out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+ // Compute out10, out11, out12 and out13
+ float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+ k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+ out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+ out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+ out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+ out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+ // Compute out20, out21, out22 and out23
+ float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+ k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+ out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+ out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+ out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+ out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+ // Compute out30, out31, out32 and out33
+ float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+ k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+ out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+ out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+ out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+ out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Store the output tile
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(HAS_BIAS)
+ // Add bias
+ out10 += (float)b;
+ out11 += (float)b;
+ out12 += (float)b;
+ out13 += (float)b;
+
+ out20 += (float)b;
+ out21 += (float)b;
+ out22 += (float)b;
+ out23 += (float)b;
+
+ out30 += (float)b;
+ out31 += (float)b;
+ out32 += (float)b;
+ out33 += (float)b;
+#endif // defined(HAS_BIAS)
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0,
+ (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
+ ({ \
+ comm_fact.s0 = d1 + d2; \
+ comm_fact.s1 = d3 + d4; \
+ comm_fact.s2 = d5 + d6; \
+ \
+ col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
+ col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
+ \
+ comm_fact.s0 = d1 - d2; \
+ comm_fact.s1 = d3 - d4; \
+ comm_fact.s2 = d5 - d6; \
+ \
+ col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
+ col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
+ })
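+
+// COMPUTE_TMP_COL evaluates one column of the F(4x4, 5x5) transform A^T * d:
+// the four dot products of d0..d7 with the rows (assuming the usual matrices)
+//   [ 1  1  1  1  1  8  8  0 ]
+//   [ 0  1 -1  2 -2  4 -4  0 ]
+//   [ 0  1  1  4  4  2  2  0 ]
+//   [ 0  1 -1  8 -8  1 -1  1 ]
+// comm_fact caches the pairwise sums and differences of (d1,d2), (d3,d4) and
+// (d5,d6) that the rows share.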
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute output address
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+ int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the input tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00, out01, out02 and out03
+ float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
+ float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
+ float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
+ float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL,
+ B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4));
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+ // Compute the 8x4 intermediate tensor
+ VEC_DATA_TYPE(float, 4)
+ comm_fact0, comm_fact1, comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+ COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+ COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
+ // Compute the 4x4 output tile
+ comm_fact0 = tmp_col1 + tmp_col2;
+ comm_fact1 = tmp_col3 + tmp_col4;
+ comm_fact2 = tmp_col5 + tmp_col6;
+
+ VEC_DATA_TYPE(float, 4)
+ out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
+ VEC_DATA_TYPE(float, 4)
+ out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
+
+ comm_fact0 = tmp_col1 - tmp_col2;
+ comm_fact1 = tmp_col3 - tmp_col4;
+ comm_fact2 = tmp_col5 - tmp_col6;
+
+ VEC_DATA_TYPE(float, 4)
+ out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
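+
+ // Applying the same row pattern across tmp_col0..tmp_col7 gives
+ // (A^T * d) * A: out_colN holds the N-th column of the final 4x4 output tile.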
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
+
+ out_col0 += (VEC_DATA_TYPE(float, 4))b;
+ out_col1 += (VEC_DATA_TYPE(float, 4))b;
+ out_col2 += (VEC_DATA_TYPE(float, 4))b;
+ out_col3 += (VEC_DATA_TYPE(float, 4))b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL),
+ VEC_DATA_TYPE(DATA_TYPE, 4)),
+ 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
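+
+// The 1-D output-transform kernels in this section are thin wrappers: they
+// forward every argument to the 2-D bodies above, where the
+// WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL define is expected to select the
+// 2x1/4x1 code path at compile time.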
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ winograd_output_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes
+#if defined(HAS_BIAS)
+ ,
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes
+#endif // defined(HAS_BIAS)
+ );
+}
+
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
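All of the 1D output-transform kernels above are thin wrappers: they forward their arguments unchanged to the shared 2D implementations (winograd_output_transform_2x2_3x3_nchw, winograd_output_transform_4x4_3x3_nchw and winograd_output_transform_4x4_5x5_nchw), with -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL or -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL selecting the row-only or column-only code path inside those functions. As a sketch, a host-side build-options string for one variant could look as follows; only the define names come from the kernel docs above, while the numeric values and the surrounding host code are illustrative assumptions:

    /* Hypothetical build options for winograd_output_transform_4x1_3x1_nchw */
    const char *build_opts =
        "-DNUM_TILES_X=16 "
        "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=1 "
        "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL "
        "-DVEC_SIZE=4 "
        "-DDATA_TYPE=float";
    /* e.g. clBuildProgram(program, 1, &device, build_opts, NULL, NULL); */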
diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
index d3231a59a1..b910a753a6 100644
--- a/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,14 +23,16 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Batch to space transformation. (NCHW)
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NHWC)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -39,6 +41,12 @@
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
* @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -48,31 +56,40 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void depth_to_space_nchw(
- TENSOR3D_DECLARATION(input),
+__kernel void batch_to_space_nhwc(
+ TENSOR4D_DECLARATION(input),
const int batch_id,
- TENSOR4D_DECLARATION(output))
+ VECTOR_DECLARATION(block_shape),
+ TENSOR3D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % r;
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
- const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
- const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0);
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * (block_x)) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, in_batch));
}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
/** Batch to space transformation. (NHWC)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The left and top crop sizes must be passed at compile time using -DCROP_LEFT and -DCROP_TOP. e.g. -DCROP_LEFT=0 -DCROP_TOP=0
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -90,22 +107,25 @@ __kernel void depth_to_space_nchw(
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void depth_to_space_nhwc(
- TENSOR3D_DECLARATION(input),
+__kernel void batch_to_space_static_nhwc(
+ TENSOR4D_DECLARATION(input),
const int batch_id,
- TENSOR4D_DECLARATION(output))
+ TENSOR3D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0) % r;
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int x = get_global_id(1) + CROP_LEFT;
+ const int y = get_global_id(2) + CROP_TOP;
+ const int z = get_global_id(0);
- const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
- const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+ const int in_batch = batch_id + ((x % block_x) + (y % block_y) * (block_x)) * BATCH_SIZE;
+ const int in_x = x / block_x;
+ const int in_y = y / block_y;
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, in_batch));
}
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
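For clarity, the index arithmetic shared by both kernels above maps every output element (channel z, spatial x/y, batch batch_id) back to its source: the spatial coordinates are divided by the block shape, and the remainders select which of the block_x * block_y interleaved batches the value came from; the static variant additionally offsets x and y by the compile-time crops first. A host-side C reference of this mapping, as a sketch (function and parameter names are illustrative):

    /* Reference of the batch-to-space (NHWC) index mapping used above. */
    void batch_to_space_ref_nhwc(int x, int y, int batch_id,
                                 int block_x, int block_y,
                                 int batch_size, int crop_left, int crop_top,
                                 int *in_x, int *in_y, int *in_batch)
    {
        x += crop_left; /* the static kernel applies CROP_LEFT/CROP_TOP up front */
        y += crop_top;
        *in_x     = x / block_x;
        *in_y     = y / block_y;
        *in_batch = batch_id + ((x % block_x) + (y % block_y) * block_x) * batch_size;
    }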
diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
new file mode 100644
index 0000000000..cb2da1bd99
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
+#ifndef USE_DEFAULT_BETA
+ __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = 0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
+
+ res0 = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res0 = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
+ // beta is not zero, hence we need to perform the addition
+ res0 = ADD_OP(res0, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */ \ No newline at end of file
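Per channel c, the vector arithmetic above reduces to the textbook batch normalization expression out = gamma[c] * (x - mean[c]) * rsqrt(var[c] + epsilon) + beta[c], followed by the selected activation. A scalar C reference, as a sketch (activation omitted; the default arguments model USE_DEFAULT_GAMMA/USE_DEFAULT_BETA):

    #include <math.h>

    /* Scalar reference of batchnormalization_layer_nhwc for a single element.
     * 1/sqrtf() is mathematically equivalent to the kernel's rsqrt(). */
    float batchnorm_ref(float x, float mean, float var, float epsilon,
                        float gamma /* 1.0f if USE_DEFAULT_GAMMA */,
                        float beta /* 0.0f if USE_DEFAULT_BETA */)
    {
        const float x_bar = (x - mean) / sqrtf(var + epsilon);
        return gamma * x_bar + beta;
    }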
diff --git a/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
new file mode 100644
index 0000000000..233beb3aa9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
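+// Fast combined div/mod: multiplies by the float reciprocal instead of using
+// integer '/' and '%'; assumed accurate for the small index ranges used below.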
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ // Offset computation
+ const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map
+
+ uint z = 0;
+ uint batch_id = 0;
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));
+
+ // Load the values
+ const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);
+
+#if VEC_SIZE == 1
+ DATA_TYPE out0 = *(input_ptr + in_channels);
+#elif VEC_SIZE == 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1)
+ };
+#elif VEC_SIZE == 3
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2)
+ };
+#elif VEC_SIZE == 4
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3)
+ };
+#elif VEC_SIZE == 8
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3),
+ *(input_ptr + in_channels.s4),
+ *(input_ptr + in_channels.s5),
+ *(input_ptr + in_channels.s6),
+ *(input_ptr + in_channels.s7)
+ };
+#elif VEC_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ out0 =
+ {
+ *(input_ptr + in_channels.s0),
+ *(input_ptr + in_channels.s1),
+ *(input_ptr + in_channels.s2),
+ *(input_ptr + in_channels.s3),
+ *(input_ptr + in_channels.s4),
+ *(input_ptr + in_channels.s5),
+ *(input_ptr + in_channels.s6),
+ *(input_ptr + in_channels.s7),
+ *(input_ptr + in_channels.s8),
+ *(input_ptr + in_channels.s9),
+ *(input_ptr + in_channels.sa),
+ *(input_ptr + in_channels.sb),
+ *(input_ptr + in_channels.sc),
+ *(input_ptr + in_channels.sd),
+ *(input_ptr + in_channels.se),
+ *(input_ptr + in_channels.sf)
+ };
+#endif // VEC_SIZE == 1
+
+ __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) \ No newline at end of file
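The gather above implements the usual channel-shuffle permutation: viewing the C channels as a [NUM_GROUPS][K] matrix, the output reads them transposed, so output channel o sources channel (o * K) % C + o / NUM_GROUPS (writing o = k * NUM_GROUPS + g gives (o * K) % C = g * K and o / NUM_GROUPS = k, i.e. source channel g * K + k). A one-line C reference of the mapping, as a sketch:

    /* Source channel read for output channel `o` in channel_shuffle_nhwc
     * (C = channel count = SRC_DIM_X, K = C / NUM_GROUPS). */
    int shuffle_src_channel(int o, int C, int num_groups)
    {
        const int K = C / num_groups;
        return (o * K) % C + o / num_groups;
    }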
diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
new file mode 100644
index 0000000000..84f8aa7263
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
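The mapping above is the inverse of the NHWC space-to-depth rearrangement: with B = BLOCK_SHAPE and r = CHANNEL_SIZE / (B * B) output channels, input channel id0 at spatial (x, y) lands at output channel id0 % r and spatial (x * B + (id0 / r) % B, y * B + (id0 / r) / B). A C reference, as a sketch:

    /* Reference of the depth-to-space (NHWC) index mapping used above. */
    void depth_to_space_ref_nhwc(int id0 /* input channel */, int x, int y,
                                 int channel_size, int B,
                                 int *out_c, int *out_x, int *out_y)
    {
        const int r = channel_size / (B * B);
        *out_c = id0 % r;
        *out_x = x * B + (id0 / r) % B;
        *out_y = y * B + (id0 / r) / B;
    }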
diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
new file mode 100644
index 0000000000..238d3a7921
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+ scale -= max(xi - (int)LAST_ACCESSED_X, 0);
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file
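Both code paths above compute the same elementwise result: for a channel index c (the X dimension in NHWC) and an 8-bit signed quantized value q, the output is scale[c] * (float)q; the LAST_ACCESSED_X branch only shifts the final vector access back so it stays within bounds. A scalar C reference, as a sketch:

    /* Scalar reference of dequantization_layer_per_channel_nhwc:
     * symmetric per-channel dequantization of an 8-bit signed value. */
    float dequantize_qsymm8_per_channel(signed char q, const float *scale, int c)
    {
        return scale[c] * (float)q;
    }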
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
new file mode 100644
index 0000000000..81ceeb8846
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "helpers_asymm.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ *
+ *@note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void direct_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, 1, M0, xi);
+ TILE(int, 1, M0, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[0].s[i] = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[0].s[i] = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
+ xi[0].s[i] -= PAD_LEFT;
+ yi[0].s[i] -= PAD_TOP;
+ })
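+
+ // Illustrative example (editorial, hypothetical values): with _IDST_WIDTH = 3,
+ // STRIDE_X = STRIDE_Y = 2 and PAD_LEFT = PAD_TOP = 1, the linear index mout + i = 4
+ // maps to xi = (4 % 3) * 2 - 1 = 1 and yi = (4 / 3) * 2 - 1 = 1.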
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ TILE(int, 1, M0, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[0].s[i] + xk;
+ int y_s = yi[0].s[i] + yk;
+ my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
+ my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
+ my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
+ })
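+
+ // Editorial note: any out-of-bound coordinate collapses the indirect index to -1;
+ // rows flagged with -1 are expected to be skipped by T_LOAD2D_INDIRECT, so the
+ // corresponding rows of tile a keep their ZERO_VALUE initialization (implicit zero padding).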
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
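+
+ // Editorial note: for asymmetric quantization the accumulation expands as
+ //   sum((a + SRC_OFFSET) * (b + WEI_OFFSET)) = sum(a*b) + WEI_OFFSET*sum(a) + SRC_OFFSET*sum(b) + K*SRC_OFFSET*WEI_OFFSET
+ // The first three terms are produced by T_MMUL/T_OFFSET_CORRECTION in the loops above;
+ // the remaining constant term, with K = WEI_WIDTH * WEI_HEIGHT * SRC_CHANNELS, is added here.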
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+#if defined(IS_QUANTIZED)
+
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ // Apply activation
+ T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // _IOUTPUT_TILE: c = F32/F16, cq = QASYMM8
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+}
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
new file mode 100644
index 0000000000..807b990e82
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution 3d.
+ *
+ * @note Data layout supported: NDHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The convolution padding (left, top and front) must be passed at compile time using -DPAD_LEFT, -DPAD_TOP and -DPAD_FRONT (e.g. -DPAD_LEFT=2, -DPAD_TOP=2, -DPAD_FRONT=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2, -DSTRIDE_Z=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH, -DWEI_HEIGHT and -DWEI_DEPTH (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9, -DWEI_DEPTH=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64, -DSRC_DEPTH=32)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH, -DDST_HEIGHT and -DDST_DEPTH (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64, -DDST_DEPTH=32)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, .... n
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time along with the bias data type using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=int).
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void direct_convolution3d_ndhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ TENSOR4D(wei, BUFFER)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IWEI_DEPTH WEI_DEPTH
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_DEPTH SRC_DEPTH
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_DEPTH DST_DEPTH
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH)
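+// Editorial note: _IY_MULTIPLIER is the number of kernel taps per output channel,
+// e.g. a 3x3x3 kernel gives 27 iterations of the outer loop below.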
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT x DEPTH
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+ TILE(int, M0, 1, zi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[i].v = (((mout + i) / _IDST_WIDTH) % _IDST_HEIGHT) * STRIDE_Y;
+ zi[i].v = (((mout + i) / (_IDST_WIDTH * _IDST_HEIGHT)) % _IDST_DEPTH) * STRIDE_Z;
+
+ xi[i].v -= PAD_LEFT;
+ yi[i].v -= PAD_TOP;
+ zi[i].v -= PAD_FRONT;
+ })
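+
+ // Illustrative example (editorial, hypothetical values): with _IDST_WIDTH = 2,
+ // _IDST_HEIGHT = 2, unit strides and no padding, the linear index mout + i = 5
+ // decomposes to x = 5 % 2 = 1, y = (5 / 2) % 2 = 0, z = 5 / 4 = 1.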
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = (ACC_DATA_TYPE)0;
+ })
+
+ for(int i = 0; i < _IY_MULTIPLIER; ++i)
+ {
+ int ck = 0;
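+ // Editorial note: ck mirrors the channel counter k below; it tracks the channel
+ // offset consumed by the indirect src loads (T_LOAD_NDHWC_INDIRECT) in lockstep with k.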
+ int xk = i % _IWEI_WIDTH;
+ int yk = (i / _IWEI_WIDTH) % _IWEI_HEIGHT;
+ int zk = i / (_IWEI_WIDTH * _IWEI_HEIGHT);
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE(DATA_TYPE, M0, K0, a);
+ TILE(DATA_TYPE, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, K0, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
+
+ // Load tile from the weights tensor
+ const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ if((cout + i) < _IDST_CHANNELS)
+ {
+ LOOP_UNROLLING(int, j, 0, 1, K0,
+ {
+ b[i].s[j] = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + j * wei_stride_y + b_offs * wei_stride_y);
+ })
+ }
+ })
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ ck += K0;
+ }
+
+#if((_ISRC_CHANNELS % K0) != 0)
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE(DATA_TYPE, M0, 1, a);
+ TILE(DATA_TYPE, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, 1, BUFFER, src, bout, zk, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, _ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
+
+ // Load tile from the weights tensor
+ const int b_offs = k + (xk * _ISRC_CHANNELS) + (yk * _ISRC_CHANNELS * _IWEI_WIDTH) + (zk * _ISRC_CHANNELS * _IWEI_WIDTH * _IWEI_HEIGHT);
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ if((cout + i) < _IDST_CHANNELS)
+ {
+ b[i].v = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) * sizeof(DATA_TYPE) + b_offs * wei_stride_y);
+ }
+ })
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ ++ck;
+ }
+#endif // ((_ISRC_CHANNELS % K0) != 0)
+ }
+
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ if((cout + N0) <= _IDST_CHANNELS)
+ {
+ bias0[0].v = VLOAD(N0)(0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
+ }
+ else
+ {
+ VLOAD_PARTIAL(N0, PARTIAL_N0)
+ (bias0[0].v, 0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
+ }
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT * _IDST_DEPTH) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT * _IDST_DEPTH);
+ })
+
+#if defined(IS_QUANTIZED)
+ TILE(DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
new file mode 100644
index 0000000000..dcbae220b6
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+// *INDENT-OFF*
+// clang-format off
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
+ * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void dwc_native_fp_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Only the weight tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (if M0 != 1, the tiles overlap by 1 element on the X dimension)
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0 // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
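+
+ // Illustrative example (editorial): with WEI_WIDTH = 3 and M0 = 2 (STRIDE_X == 1),
+ // M0_A = 3 + (2 - 1) = 4 rows of the src are read; the two output columns share the
+ // two middle rows of tile A, which is what makes M0 > 1 profitable here.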
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+ int xi = xo * STRIDE_X;
+ int yi = yo * STRIDE_Y;
+ xi -= PAD_LEFT;
+ yi -= PAD_TOP;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+#if _IWEI_HEIGHT < 5
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT < 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
+#endif // _IWEI_HEIGHT < 5
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = 0;
+ })
+
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), SRC_WIDTH, SRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
+
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+ c[m0].v += a[xk + m0].v * b[xk].v;
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ c[m0].v = fma(a[xk + m0].v, b[xk].v, c[m0].v);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+ })
+ })
+ }
+#if _IWEI_HEIGHT < 5
+ )
+#endif // _IWEI_HEIGHT < 5
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+#endif // HAS_BIAS
+
+ T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(DST_WIDTH) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(DST_WIDTH) - 1);
+ VSTORE(N0)
+ (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+}
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
new file mode 100644
index 0000000000..2d255e5b61
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// *INDENT-OFF*
+// clang-format off
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_uchar (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_char (128)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_uchar (-128)
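+
+// Editorial note: the correction re-centres mixed signedness so src and weights can share
+// one integer domain, e.g. uchar src with char weights adds 128 to each weight value
+// (char -> uchar range); the shift is compensated later via (WEI_OFFSET - WEI_OFFSET_CORRECTION).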
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
+ ({})
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
+ TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
+ TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
+ T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout, 0, 0, 0, multipliers); \
+ T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout, 0, 0, 0, shifts);
+
+#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
+#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
+
+#if defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for quantized data types
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=int8)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=int8)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=int8)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=int)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The activation type must be passed at compile time using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note The A and B variables required by some activation functions must be passed at compile time using -DA_VAL= and -DB_VAL= respectively
+ * @note The quantization offset used for both the per-tensor and per-channel quantization must be passed at compile time using -DDST_OFFSET (e.g., -DDST_OFFSET=3)
+ * @note The quantization shift for the per-tensor quantization must be passed at compile time using -DDST_SHIFT (e.g., -DDST_SHIFT=1)
+ * @note The quantization multiplier for the per-tensor quantization must be passed at compile time using -DDST_MULTIPLIER (e.g., -DDST_MULTIPLIER=121432)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
+ * - N0 = 2, 3, 4, 8, 16
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32
+ * @param[in] dst_multipliers_stride_x Stride of the destination multipliers tensor in X dimension (in bytes)
+ * @param[in] dst_multipliers_step_x dst_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_multipliers_offset_first_element_in_bytes The offset of the first element in the destination multipliers tensor
+ * @param[in] dst_shifts_ptr Pointer to the destination shifts tensor for the per-channel quantization. Supported data type: S32
+ * @param[in] dst_shifts_stride_x Stride of the destination shifts tensor in X dimension (in bytes)
+ * @param[in] dst_shifts_step_x dst_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_shifts_offset_first_element_in_bytes The offset of the first element in the destination shifts tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ */
+//! @endcond
+__kernel void dwc_native_quantized_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE),
+ VECTOR_DECLARATION(dst_multipliers),
+ VECTOR_DECLARATION(dst_shifts)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Only the weight tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (if M0 != 1, the tiles overlap by 1 element on the X dimension)
+#define _IN0_A N0_A // Cols tile A. It can be either 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0 // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+ int xi = xo * STRIDE_X;
+ int yi = yo * STRIDE_Y;
+ xi -= PAD_LEFT;
+ yi -= PAD_TOP;
+
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ // Reset accumulators
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+#if _IWEI_HEIGHT <= 5
+ LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else // _IWEI_HEIGHT <= 5
+ for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, (cout / DEPTH_MULTIPLIER), src_w, src_h, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout, yk * _IM0_B, 1, wei_stride_y, b);
+
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+#if _IWEI_WIDTH <= 16
+#define DOT_DATA_TYPE SRC_DATA_TYPE
+#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
+
+ // Optimized path for the dot instruction
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+ ACC_DATA_TYPE offset_a = 0;
+ ACC_DATA_TYPE offset_b = 0;
+
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ x0[0].s[xk] = a[xk + m0].s[n0];
+ y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+ })
+ DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+ c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+#else // _IWEI_WIDTH <= 16
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
+ })
+#endif // _IWEI_WIDTH <= 16
+ })
+ })
+ }
+#if _IWEI_HEIGHT <= 5
+ )
+#endif // _IWEI_HEIGHT <= 5
+
+#if _IWEI_WIDTH <= 16
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
+#endif // _IWEI_WIDTH <= 16
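+
+ // Editorial note: with y0 = b + WEI_OFFSET_CORRECTION, expanding
+ // sum((a + SRC_OFFSET) * (b + WEI_OFFSET)) leaves exactly the constant
+ // _IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (WEI_OFFSET - WEI_OFFSET_CORRECTION)
+ // unaccounted for by the per-pixel dot products and reductions; it is folded in above.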
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ // Load bias
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+#endif // HAS_BIAS
+
+ T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
+
+ // Quantize the tile
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+ T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
+
+ // Perform activation
+ T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ if(x_cond)
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE_PARTIAL(N0, PARTIAL_N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+ else
+ {
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ int xi_out = min(xo + M0 - 1 - m0, (int)(dst_w) - 1);
+ VSTORE(N0)
+ (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)cout * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
+ })
+ }
+}
+#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+// *INDENT-ON*
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
new file mode 100644
index 0000000000..a23e943fab
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define COND_N SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
+#if defined(IM2COL_3X3) || defined(IM2COL_9X9)
+/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
+ * @name IM2COL1X9_NHWC_STORE
+ *
+ * @note To use this macro for a 3x3 block, @p ROW has to be 0
+ *
+ * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
+ * @param[in] DATA_TYPE Data type of @p DATA
+ * @param[in] SRC_DEPTH Input channel size / depth
+ * @param[in] DATA Value variable base name
+ * @param[in] ROW The row number to store. Supported: 0-8
+ * @param[in] OUTPUT_PTR Output pointer
+ * @{
+ */
+#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ const bool at_channel_boundary = get_global_id(0) == 0; \
+ if(at_channel_boundary) \
+ { \
+ IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ } \
+ else \
+ { \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ }
+#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
+#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
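+
+// Editorial example: with VECTOR_SIZE=4 and BOUNDARY_VECTOR_SIZE=2, the work-item at
+// get_global_id(0) == 0 stores 2 elements per row via the partial variant, while all
+// other work-items store the full 4-element vectors.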
+
+#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+
+#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+/** @}*/
+#endif // defined(IM2COL_3X3) || defined(IM2COL_9X9)
+
+#if defined(IM2COL_3X3)
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
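+ // Editorial example: with VECTOR_SIZE = 4 and BOUNDARY_VECTOR_SIZE = 2, shift_amount = 2,
+ // so work-item 0 covers channels [0,1] (partial store) and work-item 1 starts at
+ // ch = 4 - 2 = 2, keeping every full vector inside the channel range without padding.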
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int3 offset = 0;
+
+ // Clamp xi
+ int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
+ xi_offset *= (int3)src_stride_y;
+
+ // Out-of-bound condition for X
+ int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+ // yi == 0
+ // Clamp yi
+ // yi_coord is cast to unsigned int so that a single min() operation is enough to clamp it
+ // A "-1" 32-bit signed variable converted to unsigned gives 4294967295
+ // This is a trick so that the values loaded in the padding areas always come from the last row (SRC_HEIGHT - 1)
+ // because of the negative yi_coord wrap-around; they are immediately overwritten with PAD_VALUE, since the
+ // wrap-around also causes y_cond (the y padding condition) to be satisfied
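+ // For example, with PAD_TOP=2 and yi=0, yi_coord = -2 becomes 4294967294 as unsigned, so the min()
+ // below clamps it to SRC_HEIGHT - 1 and the load stays within the tensor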
+ yi_coord = yi - (int)PAD_TOP;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 1
+ // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+ yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 2
+ // Clamp yi_coord
+ yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
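+ // Each output row (yo) holds one 3x3xSRC_DEPTH patch: the vector values<k>, with k = xk + 3 * yk,
+ // is written k * SRC_DEPTH elements past output_ptr (which already includes the ch offset)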
+ // Store in a boundary-aware way to avoid padding
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
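+ // output_ptr already includes the ch offset, so subtracting ch lands exactly on the appended
+ // bias column at element 9 * SRC_DEPTH of the current output row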
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_3X3)
+
+#if defined(IM2COL_9X9)
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The source spatial dimensions, padding, pad value and dilation must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DPAD_VALUE, -DDILATION_X and -DDILATION_Y
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time to append a 1 at the end of each row of the output matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch index
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int8 offset0 = 0;
+ int offset1 = 0;
+
+ // Clamp xi
+ int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+ int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+ xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ xi_offset0 *= (int8)src_stride_y;
+ xi_offset1 *= (int)src_stride_y;
+
+ // Out-of-bound condition for X
+ int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+ int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ IM2COL1x9(0);
+ IM2COL1x9(1);
+ IM2COL1x9(2);
+ IM2COL1x9(3);
+ IM2COL1x9(4);
+ IM2COL1x9(5);
+ IM2COL1x9(6);
+ IM2COL1x9(7);
+ IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_9X9)
+
+#if defined(IM2COL_GENERIC)
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of the output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in case we load out-of-bounds elements must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed at compile time to append a 1 at the end of each row of the output matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch index
+
+ // Calculate input indices
+ const int xi = (yo % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (yo / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ const int stride_x = ch * sizeof(DATA_TYPE);
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + stride_x + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + stride_x + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
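+ // The (yk, xk) loops below traverse the KERNEL_HEIGHT x KERNEL_WIDTH window row by row; iteration i
+ // writes VECTOR_SIZE (or BOUNDARY_VECTOR_SIZE) channels at element ch + i * SRC_DEPTH of output row yo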
+ int i = 0;
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ // Clamp yi_coord
+ int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+ yi_coord = clamp(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+
+ // Out-of-bound condition for Y
+ int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
+
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+ {
+ // Clamp xi_coord
+ int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+ xi_coord = clamp(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+ // Out-of-bound condition for X
+ int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Replace with PAD_VALUE if the value is out-of-bound
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ // Store in a boundary-aware way to avoid padding
+#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ const bool at_channel_boundary = get_global_id(0) == 0;
+ if(at_channel_boundary)
+ {
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ else // at_channel_boundary
+#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ {
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ i++;
+ }
+ }
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(IM2COL_GENERIC)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
new file mode 100644
index 0000000000..aa719bfef0
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(INDIRECT_CONVOLUTION_ADDRESS_PRECALCULATION)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the indirect convolution 2d indirect buffer.
+ *
+ * @note This kernel only works for unit batch_size
+ *
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The kernel width must be passed at compile time using -DWEI_CONV_WIDTH (e.g. -DWEI_CONV_WIDTH=9)
+ * @note The spatial dimensions of the source tensor used by conv2d must be passed at compile time using -DSRC_CONV_WIDTH and -DSRC_CONV_HEIGHT (e.g. -DSRC_CONV_WIDTH=96, -DSRC_CONV_HEIGHT=64)
+ * @note The width dimension of the destination tensor produced by conv2d must be passed at compile time using -DDST_CONV_WIDTH (e.g. -DDST_CONV_WIDTH=96)
+ * @note The tensor type ("BUFFER" only) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
+ *
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: INT32
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void indirect_convolution_address_precalculation(
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE))
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Note: WIDTH = M0 x KernelWidth x KernelHeight
+
+ // m index
+ const int mi = x % M0;
+ // Kernel index
+ const int ki = x / M0;
+ // Kernel width coordinate
+ const int xk = ki % WEI_CONV_WIDTH;
+ // kernel height coordinate
+ const int yk = ki / WEI_CONV_WIDTH;
+
+ TILE(DST_DATA_TYPE, 1, 1, xi);
+ TILE(DST_DATA_TYPE, 1, 1, yi);
+ TILE(DST_DATA_TYPE, 1, 1, my);
+
+ const int mout = y * M0;
+
+ xi[0].s[0] = ((mout + mi) % DST_CONV_WIDTH) * STRIDE_X;
+ yi[0].s[0] = ((mout + mi) / DST_CONV_WIDTH) * STRIDE_Y;
+ xi[0].s[0] -= PAD_LEFT;
+ yi[0].s[0] -= PAD_TOP;
+
+ const int x_s = xi[0].s[0] + xk;
+ const int y_s = yi[0].s[0] + yk;
+ my[0].s[0] = x_s + y_s * SRC_CONV_WIDTH;
+ my[0].s[0] = my[0].s[0] + z * (int)(SRC_CONV_WIDTH * SRC_CONV_HEIGHT);
+ my[0].s[0] = select(-1, my[0].s[0], x_s >= 0);
+ my[0].s[0] = select(-1, my[0].s[0], x_s < SRC_CONV_WIDTH);
+ my[0].s[0] = select(-1, my[0].s[0], y_s >= 0);
+ my[0].s[0] = select(-1, my[0].s[0], y_s < SRC_CONV_HEIGHT);
+
+ VSTORE(1)
+ (my[0].s[0], 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DST_DATA_TYPE) + y * dst_stride_y + z * dst_stride_z));
+}
+#endif // defined(INDIRECT_CONVOLUTION_ADDRESS_PRECALCULATION)
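+
+// Minimal scalar sketch (illustrative only, kept out of the build with #if 0) of the entry each
+// work-item of the kernel above writes; indirect_entry is a hypothetical helper, and the same
+// PAD_*/STRIDE_*/SRC_CONV_* compile-time options are assumed:
+#if 0
+int indirect_entry(int mout, int mi, int xk, int yk, int z)
+{
+    const int x_s = ((mout + mi) % DST_CONV_WIDTH) * STRIDE_X - PAD_LEFT + xk;
+    const int y_s = ((mout + mi) / DST_CONV_WIDTH) * STRIDE_Y - PAD_TOP + yk;
+    if(x_s < 0 || x_s >= SRC_CONV_WIDTH || y_s < 0 || y_s >= SRC_CONV_HEIGHT)
+    {
+        return -1; // out-of-bounds taps are flagged with -1 and resolved by the convolution kernel
+    }
+    // Linear pixel index of the tap inside the source tensor
+    return x_s + y_s * SRC_CONV_WIDTH + z * SRC_CONV_WIDTH * SRC_CONV_HEIGHT;
+}
+#endif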
+
+#if defined(INDIRECT_CONVOLUTION_NHWC)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the indirect convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The vector length used for loading the values from the indirect buffer should be passed at compile time using -DIND_BUFF_VEC_SIZE (e.g. -DIND_BUFF_VEC_SIZE=4)
+ * @note The activation function to fuse and corresponding A and B values should be passed at compile time using -DACTIVATION_TYPE, -DA_VAL, and -DB_VAL
+ * (e.g. -DACTIVATION_TYPE=lu_brelu_op, -DA_VAL=3.0, and -DB_VAL=1.0)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, and 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] off_img (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE
+ * @param[in] off_ptr Pointer to the indirect buffer tensor. Supported data type: INT32
+ * @param[in] off_stride_y Stride of the indirect buffer tensor in Y dimension (in bytes)
+ * @param[in] off_stride_z Stride of the indirect buffer tensor in Z dimension (in bytes)
+ * @param[in] off_stride_w Stride of the indirect buffer tensor in W dimension (in bytes)
+ * @param[in] off_c The size of the channels dimension of the indirect buffer tensor
+ * @param[in] off_w The size of the width dimension of the indirect buffer tensor
+ * @param[in] off_h The size of the height dimension of the indirect buffer tensor
+ * @param[in] off_n The size of the batches dimension of the indirect buffer tensor
+ * @param[in] off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void indirect_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_RO_T(off, OFF_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
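+ // Work decomposition: get_global_id(0) covers the output channels in steps of N0 (with a PARTIAL_N0
+ // leftover block at the start), get_global_id(1) covers the output pixels in steps of M0 and
+ // get_global_id(2) covers the batch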
+
+ off_offset_first_element_in_bytes += get_global_id(1) * off_stride_y;
+ off_offset_first_element_in_bytes += bout * off_stride_z;
+
+ // Initialize the accumulators
+ TILE(DST_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ TILE(int, 1, IND_BUFF_VEC_SIZE, my);
+ T_LOAD(int, 1, IND_BUFF_VEC_SIZE, OFF_TENSOR_TYPE, off, i * M0, 0, 1, 0, my);
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.0;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = 0.0;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = 0.0;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, DST_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+ // Apply activation
+ T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ const bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IY_MULTIPLIER
+}
+#endif // defined(INDIRECT_CONVOLUTION_NHWC)
diff --git a/src/core/CL/cl_kernels/nhwc/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
new file mode 100644
index 0000000000..7e35e161c8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(WIDTH_SIZE)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of channels (the size of the first dimension) should be given as a preprocessor argument using -DWIDTH_SIZE=size. e.g. -DWIDTH_SIZE=192
+ * @note The leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Offset computation
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int left_slice = max((int)0, (int)x_offs - (int)RADIUS);
+ const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS);
+
+ for(int i = left_slice; i <= right_slice; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE)));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized);
+
+ STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(WIDTH_SIZE)
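+
+// Minimal scalar sketch (illustrative only, kept out of the build with #if 0) of the cross-map
+// normalization computed above; cross_map_normalize is a hypothetical helper operating on one
+// pixel of WIDTH_SIZE channels, with the same RADIUS/COEFF/KAPPA/BETA compile-time options assumed:
+#if 0
+float cross_map_normalize(const float *pixel, int c)
+{
+    float acc = 0.0f;
+    for(int k = max(c - (int)RADIUS, 0); k <= min(c + (int)RADIUS, (int)WIDTH_SIZE - 1); ++k)
+    {
+        acc += pixel[k] * pixel[k]; // sum of squares over the neighbouring channels
+    }
+    return pixel[c] / pow(acc * (float)COEFF + (float)KAPPA, (float)BETA);
+}
+#endif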
+
+#if defined(NUM_SLICES) && defined(DIM1_SIZE)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note The size of the second dimension has to be passed at compile time using -DDIM1_SIZE: e.g. -DDIM1_SIZE=64
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Offset computation
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ const int current_cols = get_global_id(1);
+ const int current_rows = get_global_id(2);
+
+ // Address computation
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int first_col = max(0, current_cols - (int)RADIUS);
+ const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
+
+#if defined(IN_MAP_2D)
+ const int first_row = max(0, current_rows - (int)RADIUS);
+ const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+ for(int j = first_row; j <= last_row; ++j)
+ {
+#else // defined(IN_MAP_2D)
+ const int j = current_rows;
+#endif /* defined(IN_MAP_2D) */
+ for(int i = first_col; i <= last_col; ++i)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+#if defined(IN_MAP_2D)
+ }
+#endif /* defined(IN_MAP_2D) */
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * input_stride_y + current_rows * input_stride_z)), normalized);
+
+ STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(NUM_SLICES) && defined(DIM1_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..86c33499e2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
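+ // Illustrative example: with VEC_SIZE=4 and VEC_SIZE_LEFTOVER=3, work-item 0 starts at element 0 and
+ // stores only 3 elements, while work-item 1 starts at element 3 and stores a full vector, so the
+ // leftover block sits at the beginning of the row and nothing is stored out of bounds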
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
+
+ const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr);
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ TYPE res0 = (data - curr_mean) / curr_std;
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..7bc3c15a63
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
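+ // Note: when C is not a multiple of VEC_SIZE, x_offs shifts every work-item after the
+ // first back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements. Illustrative example (assumed
+ // values): with VEC_SIZE=16 and VEC_SIZE_LEFTOVER=3, get_global_id(0) == 0 starts at
+ // element 0 and stores only the 3 leftover elements, while get_global_id(0) == 1
+ // starts at element 3 and stores a full 16-wide vector.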
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE));
+ data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
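+ // The three conversions above dequantize to float as real = (q - OFFSET) * SCALE.
+ // Illustrative example (assumed values): with OFFSET=128 and SCALE=0.5f, a quantized
+ // input of 130 dequantizes to (130 - 128) * 0.5f = 1.0f.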
+
+ // Perform normalization
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
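+ // Requantize: q = round(real / SCALE) + OFFSET, saturated to DATA_TYPE by CONVERT_SAT.
+ // With the assumed values above, a normalized result of -0.75f maps to
+ // round(-0.75f / 0.5f) + 128 = 126.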
+ STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
new file mode 100644
index 0000000000..4e5481d1db
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+/** Performs a 3D pooling layer of size equal to MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes)
+ * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc(
+ TENSOR5D_DECLARATION(input),
+ TENSOR5D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, every work-item after the first is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so that get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+ // The depth size dimension and the batch size dimension are collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+ int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
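+ // Illustrative example (assumed sizes): with DST_HEIGHT=4 and DST_DEPTH=2, a collapsed
+ // index GET_SPATIAL_IDX(2, 1, 0) == 13 decomposes into idx_out_h = 13 % 4 = 1,
+ // idx_out_d = (13 / 4) % 2 = 1 and idx_out_n = (13 / 4) / 2 = 1.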
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+ output_stride_w + idx_out_n * output_stride_v;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+ int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+ // The start of the pooling window along the width must exclude the padding
+ int pool_x_s = max((int)0, -idx_in_w);
+ // Assuming symmetric padding (left padding == right padding == PAD_X), the window end is either the pool width or whatever remains from the current position to (src width + right pad)
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+ int pool_z_s = max((int)0, -idx_in_d);
+ int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+ // The filter size with all padding in all directions considered.
+ int filter_size = pool_z_e * pool_y_e * pool_x_e;
+
+ // The end of the pooling window along the width must exclude PAD_X
+ pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+ pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+ pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+#if defined(EXCLUDE_PADDING)
+ filter_size = (pool_z_e - pool_z_s) * (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#endif // defined(EXCLUDE_PADDING)
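+ // Illustrative example (assumed values): with POOL_SIZE_X=3, PAD_X=1, SRC_WIDTH=8 and
+ // idx_in_w=-1 (first output column), pool_x_s = max(0, 1) = 1 and pool_x_e stays 3, so
+ // with -DEXCLUDE_PADDING the window contributes 3 - 1 = 2 valid columns instead of 3.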
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ // Global pooling path
+ for(int z = 0; z < POOL_SIZE_Z; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ for(int z = pool_z_s; z < pool_z_e; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && POOL_SIZE_Z == SRC_DEPTH && PAD_X == 0 && PAD_Y == 0 && PAD_Z == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src)),
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+ // Store result
+#if defined(QUANTIZED)
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#elif defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
new file mode 100644
index 0000000000..abf0db9d07
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX()
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
+ { \
+ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
+ }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
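+// REQUANTIZE maps a value from the input to the output quantization space:
+// q_out = convert_sat(round_to_nearest_even((q_in - in_offset) * in_scale / out_scale + out_offset)).
+// Illustrative example (assumed values): in_offset=0, in_scale=0.5f, out_scale=0.25f and
+// out_offset=10 map a pooled value of 6 to 6 * 0.5f / 0.25f + 10 = 22.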
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+/** Performs a 3D pooling layer of size equal to MxNxD. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8_SIGNED, QASYMM8
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2
+ * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH
+ * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions
+ * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED, QASYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes)
+ * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_3d_layer_MxN_ndhwc_quantized(
+ TENSOR5D_DECLARATION(input),
+ TENSOR5D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, every work-item after the first is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so that get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+
+ // The depth size dimension and the batch size dimension are collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH;
+ int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d *
+ output_stride_w + idx_out_n * output_stride_v;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y;
+ int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z;
+
+ // The start of the pooling window along the width must exclude the padding
+ int pool_x_s = max((int)0, -idx_in_w);
+ // Assuming symmetric padding (left padding == right padding == PAD_X), the window end is either the pool width or whatever remains from the current position to (src width + right pad)
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h);
+ int pool_z_s = max((int)0, -idx_in_d);
+ int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = pool_z_e * pool_y_e * pool_x_e;
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+
+ // The end of the pooling window along the width must exclude PAD_X
+ pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w);
+ pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h);
+ pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d);
+
+ for(int z = pool_z_s; z < pool_z_e; ++z)
+ {
+ int depth_offset_src = (z + idx_in_d) * input_stride_w;
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+ int height_offset_src = (y + idx_in_h) * input_stride_z;
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+ int width_offset_src = (x + idx_in_w) * input_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src));
+ data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+ res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ filter_size++;
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ }
+ }
+ }
+
+#if defined(POOL_AVG)
+ res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
+#endif // defined(POOL_AVG)
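+ // Adding filter_size >> 1 before the integer division rounds the average to the
+ // nearest integer instead of truncating: e.g. an accumulated sum of 7 over a 4-element
+ // window gives (7 + 2) / 4 = 2 rather than 7 / 4 = 1.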
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
new file mode 100644
index 0000000000..5b59ff5088
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, every work-item after the first is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so that get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(EXCLUDE_PADDING)
+ int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#else // defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(EXCLUDE_PADDING)
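+ // Illustrative example (assumed values): with POOL_SIZE_X=POOL_SIZE_Y=3 and
+ // PAD_X=PAD_Y=1, the top-left output element sees a 2x2 valid region, so filter_size
+ // is 4 with -DEXCLUDE_PADDING and 9 without it.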
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ // Global pooling path
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+
+/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2x2_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+ ,
+ TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
+{
+ // Note: If C is not a multiple of VEC_SIZE, every work-item after the first is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements, so that get_global_id(0) == 0 computes the leftover elements
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else // DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+ int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+ int x0 = pool_x_s + idx_in_w;
+ int y0 = pool_y_s + idx_in_h;
+ int x1 = pool_x_e - 1 + idx_in_w;
+ int y1 = pool_y_e - 1 + idx_in_h;
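+ // (x0, y0) and (x1, y1) are the corners of the clamped 2x2 window in the source
+ // tensor. Illustrative example (assumed values): for the top-left output with
+ // PAD_X=PAD_Y=1, idx_in_w = idx_in_h = -1 and the window clamps to the single valid
+ // element at (0, 0), i.e. x0 = x1 = y0 = y1 = 0 and filter_size = 1.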
+
+ REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
+
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+ data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+ data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+ data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+ if(filter_size != 4)
+ {
+ SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+ SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+ SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+ SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+ // Invalidate the loaded values if the x or y coordinate was clamped (out-of-bound)
+ data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+ data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
+ data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
+ data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
+ }
+#endif // !defined(POOL_MAX)
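+ // For average and L2 pooling, the four loads above may come from clamped
+ // (out-of-bound) coordinates; replacing those values with INITIAL_VALUE (the identity
+ // of POOL_OP, e.g. 0 for -DPOOL_AVG) keeps them from affecting the accumulation.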
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+ data1 *= data1;
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = data0;
+ res0 = POOL_OP(res0, data1);
+ res0 = POOL_OP(res0, data2);
+ res0 = POOL_OP(res0, data3);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#if defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#else // !defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
+#endif // defined(EXCLUDE_PADDING)
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+
+ // This part is used to return the index of the maximum value
+ // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input or the output tensor
+
+ // Note: the batch dimension does not contribute to the index offset
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ base_index = (uint)idx_out_c;
+
+ base_index += VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+
+ index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
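+ // Two-level tournament: the first two selects keep the index of the larger value in
+ // each pair (data0 vs data1, data2 vs data3), and the last select compares the pair
+ // maxima, leaving the index of the overall maximum in index0. isgreaterequal() makes
+ // ties resolve towards the lower index.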
+
+ __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
+ indices_stride_w;
+
+ // Store result
+ STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
index d8cef2b4e6..46268a4a88 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,108 +51,6 @@
#error "L2 pooling is not supported"
#endif /* defined(POOL_L2) */
-int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return ((end_y - start_y) * (end_x - start_x));
-}
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_MxN_quantized_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int8 vdata = INITIAL_VALUE;
- int sdata = INITIAL_VALUE;
-
- // Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
- {
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
- {
- VEC_TYPE(8)
- data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- int8 data0 = convert_int8(data);
- vdata = POOL_OP(vdata, data0);
- }
-
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- int data0 = convert_int(data);
- sdata = POOL_OP(sdata, data0);
- }
- }
-
- // Reduce result
- int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
- int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- int res = POOL_OP(reduce2.s0, reduce2.s1);
- res = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG)
- res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
-#endif /* defined(POOL_AVG) */
-
- DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-
- const float result_f32 = convert_float(result_q8);
- const float input_offset = (float)OFFSET_IN1;
- const float input_scale = (float)SCALE_IN1;
- const float scale_out = (float)SCALE_OUT;
- const float offset_out = (float)OFFSET_OUT;
- const float in_f32 = (result_f32 - input_offset) * input_scale;
- const float out_f32 = in_f32 / scale_out + offset_out;
- result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
-
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
- *(__global DATA_TYPE *)output.ptr = result_q8;
-}
-
#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
* -# max, -DPOOL_MAX must be passed at compile time
diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
index a1a2e4696b..a340b0b8a2 100644
--- a/src/core/CL/cl_kernels/flatten.cl
+++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,13 +23,21 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH)
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
-/** This opencl kernel flattens the first 3 dimensions of the input tensor
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
+ ({ \
+ int offset = zo / (int)SRC_DEPTH; \
+ xi = xo * (int)STRIDE + offset % (int)STRIDE; \
+ yi = yo * (int)STRIDE + offset / (int)STRIDE; \
+ zi = zo % SRC_DEPTH; \
+ })
+
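+// Illustrative example (assumed values): with STRIDE=2 and SRC_DEPTH=4, the output
+// element at (xo, yo, zo) = (1, 1, 9) has offset = 9 / 4 = 2, so it reads the input at
+// xi = 1 * 2 + 2 % 2 = 2, yi = 1 * 2 + 2 / 2 = 3, zi = 9 % 4 = 1.
+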
+/** Performs a reorganization layer from the input tensor to the output tensor when the data layout is NHWC
*
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The width, height and depth of the input tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24, -DSRC_DEPTH=16
- * @note If the output has 3 dimensions, the 2nd dimension of the output tensor must be passed at compile time using -DDST_DIM1. e.g -DDST_DIM1=3
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: All
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -37,38 +45,32 @@
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void flatten(
- TENSOR4D_DECLARATION(src),
+__kernel void reorg_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst))
{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
- uint c = get_global_id(2) % SRC_DEPTH; // input feature map
- uint b0 = get_global_id(2) / SRC_DEPTH; // batch id
- uint b1 = 0;
+ int xo = get_global_id(1);
+ int yo = get_global_id(2);
+ int zo = get_global_id(0);
+ int xi, yi, zi;
-#if defined(DST_DIM1)
- uint b_tmp = b0;
- b0 = b_tmp % DST_DIM1; // batch id0
- b1 = b_tmp / DST_DIM1; // batch id1
-#endif // defined(DST_DIM1)
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (uint)SRC_WIDTH + c * (uint)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(
- DATA_TYPE) + b0 * dst_stride_y + b1 * dst_stride_z;
+ int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
- *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
}
-#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
\ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
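
For reference, the source-coordinate mapping used by CALCULATE_SRC_COORDINATES in the new reorg_layer_nhwc kernel reduces to the following plain C (reorg_src_coords is an illustrative name):

    /* Maps an NHWC reorg output element (xo, yo, zo) to its source element,
     * with stride/src_depth matching the -DSTRIDE/-DSRC_DEPTH build options. */
    static void reorg_src_coords(int xo, int yo, int zo, int stride, int src_depth,
                                 int *xi, int *yi, int *zi)
    {
        const int offset = zo / src_depth;   /* sub-position within the block */
        *xi = xo * stride + offset % stride; /* source column */
        *yi = yo * stride + offset / stride; /* source row */
        *zi = zo % src_depth;                /* source channel */
    }

With stride 2 and src_depth 4, for example, output channel zo = 9 gives offset = 2, so the element is read from source channel 1 at (2 * xo, 2 * yo + 1): each group of src_depth output channels picks a different position inside the 2x2 block.
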
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
new file mode 100644
index 0000000000..e071b0f192
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(SCALE_NEAREST_NEIGHBOUR)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the NEAREST NEIGHBOUR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
+ */
+//! @endcond
+__kernel void scale_nearest_neighbour_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
+#elif SAMPLING_POLICY_CENTER
+ float xi_f = ((xo + 0.5f) * scale_x);
+ float yi_f = ((yo + 0.5f) * scale_y);
+#else // SAMPLING_POLICY
+#error("Unsupported sampling policy");
+#endif // SAMPLING_POLICY
+
+#ifdef ALIGN_CORNERS
+ xi_f = round(xi_f);
+ yi_f = round(yi_f);
+#endif // ALIGN_CORNERS
+
+ const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
+ const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
+}
+#endif /* SCALE_NEAREST_NEIGHBOUR */
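
Per axis, the nearest-neighbour index computation above amounts to the following C sketch (nearest_src_index and its flags are illustrative names for the kernel's compile-time options):

    #include <math.h>

    /* Nearest-neighbour source index for one axis, mirroring the kernel's
     * TOP_LEFT/CENTER sampling policies and optional ALIGN_CORNERS rounding. */
    static int nearest_src_index(int out_idx, float scale, int src_size,
                                 int policy_center, int align_corners)
    {
        float f = policy_center ? (out_idx + 0.5f) * scale : out_idx * scale;
        if (align_corners)
            f = roundf(f);
        int i = (int)f;
        if (i < 0)            i = 0;            /* clamp into the source extent */
        if (i > src_size - 1) i = src_size - 1;
        return i;
    }
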
+
+#if defined(SCALE_BILINEAR)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
+ *
+ * @note In case of QASYMM8, the following extra information must be passed at compile time:
+ * - The source offset e.g. -DOFFSET=4
+ * - The source scale e.g. -DSCALE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
+ */
+//! @endcond
+__kernel void scale_bilinear_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
+#elif SAMPLING_POLICY_CENTER
+ float xi_f = ((xo + 0.5f) * scale_x - 0.5f);
+ float yi_f = ((yo + 0.5f) * scale_y - 0.5f);
+#else // SAMPLING_POLICY
+#error("Unsupported sampling policy");
+#endif // SAMPLING_POLICY
+
+ const int xi = (int)floor(xi_f);
+ const int yi = (int)floor(yi_f);
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+ TILE(SRC_DATA_TYPE, 1, N0, in01);
+ TILE(SRC_DATA_TYPE, 1, N0, in10);
+ TILE(SRC_DATA_TYPE, 1, N0, in11);
+
+ // Initialize the tiles to CONSTANT_VALUE
+ in00[0].v = CONSTANT_VALUE;
+ in01[0].v = CONSTANT_VALUE;
+ in10[0].v = CONSTANT_VALUE;
+ in11[0].v = CONSTANT_VALUE;
+
+#ifndef BORDER_MODE_REPLICATE
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
+#else // BORDER_MODE_REPLICATE
+ const int xi0 = clamp(xi, 0, (int)src_w - 1);
+ const int yi0 = clamp(yi, 0, (int)src_h - 1);
+ const int xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ const int yi1 = clamp(yi + 1, 0, (int)src_h - 1);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
+#endif // BORDER_MODE_REPLICATE
+
+ TILE(DST_DATA_TYPE, 1, N0, out);
+
+#if defined(IS_FLOATING_POINT)
+ const SRC_DATA_TYPE a = (SRC_DATA_TYPE)(xi_f - (float)xi);
+ const SRC_DATA_TYPE b = (SRC_DATA_TYPE)(1.f - a);
+ const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi);
+ const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1);
+
+ // Calculate the output
+ out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
+#else // defined(IS_FLOATING_POINT)
+
+ const float a = (xi_f - (float)xi);
+ const float b = (1.f - a);
+ const float a1 = (yi_f - (float)yi);
+ const float b1 = (1.f - a1);
+
+ out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
+ (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
+ (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
+ (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
+ VEC_DATA_TYPE(DST_DATA_TYPE, N0));
+#endif // defined(IS_FLOATING_POINT)
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
+}
+#endif /* SCALE_BILINEAR */
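
The floating-point path of the bilinear kernel is the standard four-neighbour blend; as a C sketch (bilinear_sample is an illustrative name), with a/a1 the fractional parts of the source coordinates:

    #include <math.h>

    /* in00/in01 share source row yi; in10/in11 share row yi + 1. */
    static float bilinear_sample(float in00, float in01, float in10, float in11,
                                 float xi_f, float yi_f)
    {
        const float a  = xi_f - floorf(xi_f); /* horizontal fraction */
        const float a1 = yi_f - floorf(yi_f); /* vertical fraction */
        const float b  = 1.f - a;
        const float b1 = 1.f - a1;
        return in00 * b * b1 + in01 * a * b1 + in10 * b * a1 + in11 * a * a1;
    }

The four weights sum to 1, so constant regions are preserved; the quantized path performs the same blend in float and then saturates back to the destination type.
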
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
index cb11786ac4..695bd4c217 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,75 +24,6 @@
#include "helpers.h"
#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
-/** Calculate the space to batch conversion.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
- * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
- * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
- * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] paddingse_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_nchw(
- TENSOR4D_DECLARATION(input),
- IMAGE_DECLARATION(paddings),
- VECTOR_DECLARATION(block_shape),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
- const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
- const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
- const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
-
- int block_x = *((__global int *)vector_offset(&block, 0));
- int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int out_x = get_global_id(0);
- const int out_y = get_global_id(1);
- const int z = get_global_id(2);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - pad_left_x;
- const int in_y = pos_y - pad_left_y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
- }
-}
/** Calculate the space to batch conversion. (NHWC)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -133,7 +64,7 @@ __kernel void space_to_batch_nhwc(
const int batch_id,
TENSOR3D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -165,62 +96,6 @@ __kernel void space_to_batch_nhwc(
#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
-/** Calculate the space to batch conversion.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
- * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
- * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
- * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_static_nchw(
- TENSOR4D_DECLARATION(input),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int block_x = BLOCK_SHAPE_X;
- int block_y = BLOCK_SHAPE_Y;
-
- const int out_x = get_global_id(0);
- const int out_y = get_global_id(1);
- const int z = get_global_id(2);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - PAD_LEFT_X;
- const int in_y = pos_y - PAD_LEFT_Y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
- }
-}
/** Calculate the space to batch conversion. (NHWC)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -255,7 +130,7 @@ __kernel void space_to_batch_static_nhwc(
const int batch_id,
TENSOR3D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
int block_x = BLOCK_SHAPE_X;
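
Both the dynamic and static space-to-batch variants share the same output-to-input index arithmetic, sketched here in C (space_to_batch_src is an illustrative name):

    /* Returns non-zero and fills the source coordinate when output element
     * (out_x, out_y) of output batch batch_id maps inside the unpadded input. */
    static int space_to_batch_src(int out_x, int out_y, int batch_id, int batch_in,
                                  int block_x, int block_y,
                                  int pad_left_x, int pad_left_y,
                                  int width_in, int height_in,
                                  int *in_x, int *in_y, int *in_w)
    {
        const int block_id = batch_id / batch_in; /* block offset encoded in the batch */
        const int pos_x    = out_x * block_x + block_id % block_x;
        const int pos_y    = out_y * block_y + block_id / block_x;
        if (pos_x < pad_left_x || pos_x >= pad_left_x + width_in ||
            pos_y < pad_left_y || pos_y >= pad_left_y + height_in)
            return 0; /* position falls in the padding */
        *in_x = pos_x - pad_left_x;
        *in_y = pos_y - pad_left_y;
        *in_w = batch_id % batch_in; /* original input batch */
        return 1;
    }
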
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
new file mode 100644
index 0000000000..10aac6d5fb
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The channel size of the output tensor must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
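
Since r = CHANNEL_SIZE / BLOCK_SHAPE^2 must equal the source channel count for the modulo to cover it, CHANNEL_SIZE is read here as the destination channel count; the index math as a C sketch (space_to_depth_src is an illustrative name):

    /* Source coordinates for one output element; gid0 spans the output channels. */
    static void space_to_depth_src(int x, int y, int gid0, int block,
                                   int channel_size, /* destination channels */
                                   int *in_x, int *in_y, int *in_z)
    {
        const int r = channel_size / (block * block); /* source channel count */
        *in_z = gid0 % r;
        *in_x = x * block + (gid0 / r) % block; /* column inside the block */
        *in_y = y * block + (gid0 / r) / block; /* row inside the block */
    }
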
diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
new file mode 100644
index 0000000000..1393537283
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the transposed convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The transposed convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The transposed convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type (currently only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (currently only "BUFFER" is supported) of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (currently only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the bias tensor must be passed at compile time using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note If bias exists, the compile time argument -DHAS_BIAS should be passed
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1
+ * - N0 = 1, 2, 3, 4, 8, 16
+ * - K0 = 1, 2, 3, 4, 8, 16
+ *
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ * - -DIS_QUANTIZED
+ * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ * - The destination quantization shift e.g. -DDST_SHIFT=4
+ * - The destination offset e.g. -DDST_OFFSET=4
+ * - The source offset e.g. -DSRC_OFFSET=4
+ * - The weights offset e.g. -DWEI_OFFSET=4
+ * - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels (IFM) dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels (OFM) dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels (IFM) dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches (OFM) dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16)
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ */
+//! @endcond
+__kernel void transposed_convolution_nhwc(
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, 1, M0, xi);
+ TILE(int, 1, M0, yi);
+ TILE(int, 1, M0, xu);
+ TILE(int, 1, M0, yu);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xu[0].s[i] = ((mout + i) % _IDST_WIDTH) - PAD_LEFT;
+ yu[0].s[i] = ((mout + i) / _IDST_WIDTH) - PAD_TOP;
+ xi[0].s[i] = ceil(xu[0].s[i] / (float)STRIDE_X);
+ yi[0].s[i] = ceil(yu[0].s[i] / (float)STRIDE_Y);
+ })
+
+ // Initialize the accumulators
+ TILE(ACC_DATA_TYPE, M0, N0, c);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ c[i].v = 0;
+ })
+
+ // Flipped indices
+ const int x_start = _IWEI_WIDTH - (xi[0].s[0] * STRIDE_X - xu[0].s[0]) - 1;
+ const int y_start = _IWEI_HEIGHT - (yi[0].s[0] * STRIDE_Y - yu[0].s[0]) - 1;
+
+ for(int yk = y_start, yi_step = 0; yk >= 0; yk -= STRIDE_Y, ++yi_step)
+ {
+ for(int xk = x_start, xi_step = 0; xk >= 0; xk -= STRIDE_X, ++xi_step)
+ {
+ const int weights_y = cout * _IY_MULTIPLIER + yk * _IWEI_WIDTH + xk;
+
+ TILE(int, 1, M0, my);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ int x_s = xi[0].s[i] + xi_step;
+ int y_s = yi[0].s[i] + yi_step;
+ my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
+ my[0].s[i] = my[0].s[i] + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+ my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
+ my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
+ my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
+ })
+
+ int ck = 0;
+ for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
+ {
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+#if defined(IS_QUANTIZED)
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+#endif // defined(IS_QUANTIZED)
+ }
+
+ // This #if directive should be removed in case of dynamic tensor support
+#if defined(LEFTOVER_LOOP)
+ // Left-over accumulations
+ for(; ck < _ISRC_CHANNELS; ++ck)
+ {
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = ZERO_VALUE;
+ })
+
+ // Load tile from the src tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, 1, BUFFER, src, ck, src_stride_y, my, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, weights_y, _IY_MULTIPLIER, wei_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+#if defined(IS_QUANTIZED)
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+#endif // defined(IS_QUANTIZED)
+ }
+#endif // defined(LEFTOVER_LOOP)
+ }
+ }
+
+#if defined(IS_QUANTIZED)
+ const int total_pixels = floor(1 + y_start / (float)STRIDE_Y) * floor(1 + x_start / (float)STRIDE_X);
+
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (total_pixels * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+#endif // defined(IS_QUANTIZED)
+
+#if defined(HAS_BIAS)
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
+
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
+#endif // HAS_BIAS
+
+#if defined(IS_QUANTIZED)
+
+ TILE(DST_DATA_TYPE, M0, N0, cq);
+
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+}
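
The tap selection in the kernel above (the xu/xi coordinates and the flipped x_start/y_start indices) walks the flipped weights with a stride-sized step; a hedged C sketch for one axis (transposed_conv_taps is an illustrative name):

    #include <math.h>
    #include <stdio.h>

    /* Prints which (source column, weight column) pairs feed output column xo. */
    static void transposed_conv_taps(int xo, int pad_left, int stride_x,
                                     int wei_w, int src_w)
    {
        const int xu      = xo - pad_left;                    /* upsampled-space position */
        const int xi      = (int)ceilf(xu / (float)stride_x); /* first contributing source column */
        const int x_start = wei_w - (xi * stride_x - xu) - 1; /* flipped weight index for xi */
        for (int xk = x_start, step = 0; xk >= 0; xk -= stride_x, ++step)
        {
            const int x_s = xi + step;
            if (x_s >= 0 && x_s < src_w) /* same bounds the kernel enforces via select() */
                printf("dst %d += src %d * wei %d\n", xo, x_s, xk);
        }
    }
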
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
index d0cc0f24b7..74b9674a88 100644
--- a/src/core/CL/cl_kernels/upsample_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,61 +23,6 @@
*/
#include "helpers.h"
-/** This function applies upsample on an input image. (NCHW)
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: All
- * -# -DVEC_SIZE_IN = Input vector size
- * -# -DVEC_SIZE_OUT = Output vector size
- * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
- * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void upsample_layer_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
- const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
- src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
- dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
-
- vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
- vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
-#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
-#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
-
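
The removed NCHW kernel is a plain 2x nearest-neighbour upsample: every source element is written to a 2x2 block of the destination. Equivalent scalar logic as a C sketch (upsample2x_row is an illustrative name):

    /* Duplicates one source row into two destination rows, doubling the width. */
    static void upsample2x_row(const float *src, float *dst_row0, float *dst_row1,
                               int src_w)
    {
        for (int x = 0; x < src_w; ++x)
        {
            dst_row0[2 * x] = dst_row0[2 * x + 1] = src[x];
            dst_row1[2 * x] = dst_row1[2 * x + 1] = src[x];
        }
    }
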
/** This function applies upsample on an input image. (NHWC)
*
* @attention The following variables must be passed at compile time:
@@ -132,4 +77,4 @@ __kernel void upsample_layer_nhwc(
*((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
*((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
index 5c3bb8aa9b..45fbc1b641 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,6 @@
*/
#include "helpers.h"
-#if defined(SRC_DIM_Z)
-
#define OUTPUT_ROW_2x2_7x7(out, tmp) \
({ \
out.s0 = -tmp.s0 / 36.f; \
@@ -37,291 +35,9 @@
out.s7 = tmp.s6; \
})
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0 = 0.0f;
- out0.s0 = (w0.s0);
- out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
- out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
- out0.s3 = (w0.s2);
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out1 = 0.0f;
- out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
- out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out2 = 0.0f;
- out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
- out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
- out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
- out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out3 = 0.0f;
- out3.s0 = (w2.s0);
- out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
- out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
- out3.s3 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 16 channels for 3x3 kernels
- // 4 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
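
The out0..out3 rows computed above are the rows of the F(2x2, 3x3) filter-transform matrix G = [[1, 0, 0], [1/2, 1/2, 1/2], [1/2, -1/2, 1/2], [0, 0, 1]] applied to the 3x3 weights as G * w * G^T; the 3x1/1x3 variants apply a single pass. One pass as a C sketch (winograd_f22_k3_pass is an illustrative name):

    /* One pass of the F(2x2, 3x3) filter transform: 3 weight taps -> 4 values. */
    static void winograd_f22_k3_pass(const float w[3], float out[4])
    {
        out[0] = w[0];
        out[1] = 0.5f * (w[0] + w[1] + w[2]);
        out[2] = 0.5f * (w[0] - w[1] + w[2]);
        out[3] = w[2];
    }
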
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = (w0.s0) / 16.f;
- out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
- out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
- out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s5 = (w0.s2) / 4.f;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
- out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (w2.s0) / 4.f;
- out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
- out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
- out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s5 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 36 channels for 3x3 kernels
- // 6 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
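
The divisors in the 4x4/3x3 variant (16, 24, 96, ...) are products of pairs of coefficients from the F(4x4,3x3) transform matrix. A sketch of that 6x3 matrix G, read back from the unrolled expressions above (array name is ours): e.g. out0.s0 = w0.s0/16 = (1/4)*(1/4)*w0.s0, and out0.s1 = -(w0.s0 + w0.s1 + w0.s2)/24 = (1/4)*(-1/6)*(w0.s0 + w0.s1 + w0.s2).

/* F(4x4,3x3) filter-transform matrix, under the convention out = G * g * G^T. */
static const float G_4x4_3x3[6][3] = {
    {  1.0f /  4.0f,  0.0f,          0.0f         },
    { -1.0f /  6.0f, -1.0f /  6.0f, -1.0f /  6.0f },
    { -1.0f /  6.0f,  1.0f /  6.0f, -1.0f /  6.0f },
    {  1.0f / 24.0f,  1.0f / 12.0f,  1.0f /  6.0f },
    {  1.0f / 24.0f, -1.0f / 12.0f,  1.0f /  6.0f },
    {  0.0f,          0.0f,          1.0f         },
};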
-
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
@@ -344,10 +60,12 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the source tensor
*/
__kernel void winograd_filter_transform_4x4_3x3_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
@@ -476,312 +194,11 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
*(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
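
With this change, each NHWC variant is compiled only when its dedicated guard is defined, and SRC_DIM_Z becomes a trailing runtime kernel argument rather than a -DSRC_DIM_Z build option. A hypothetical host-side sketch of the new convention (the argument index and helper name are illustrative; the tensor arguments expanded from TENSOR4D_DECLARATION/TENSOR3D_DECLARATION are elided):

#include <CL/cl.h>

/* Build the program with exactly one variant enabled, e.g.:
 *   "-DWINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC -DDATA_TYPE=float"
 * then pass SRC_DIM_Z as the last kernel argument at enqueue time. */
static cl_int set_src_dim_z(cl_kernel kernel, cl_uint last_arg_index, cl_int src_dim_z)
{
    return clSetKernelArg(kernel, last_arg_index, sizeof(cl_int), &src_dim_z);
}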
-/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- *
- * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_5x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Transform the input tile
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = w00.s0;
- out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
- out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
- out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
- out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
- out0.s7 = w01;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
- out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
- out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
- out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
-
- // Row 6
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out6 = 0.0f;
- out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
- out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
-
- // Row 7
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out7 = 0.0f;
- out7.s0 = w40.s0;
- out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
- out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
- out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
- out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
- out7.s7 = w41;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
- // Store the values across the channels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
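
As with the 3x3 case, the 5x5 coefficients above encode out = G * g * G^T; reading them back gives the 8x5 matrix below (cross-term check: out1.s1 = (-2/9)*(-2/9)*sum = 4/81 * sum, matching the kernel). A sketch, with an array name of our choosing:

/* F(4x4,5x5) filter-transform matrix, read off the out0 row above. */
static const float G_4x4_5x5[8][5] = {
    {  1.0f,           0.0f,           0.0f,           0.0f,           0.0f          },
    { -2.0f / 9.0f,   -2.0f / 9.0f,   -2.0f / 9.0f,   -2.0f / 9.0f,   -2.0f / 9.0f   },
    { -2.0f / 9.0f,    2.0f / 9.0f,   -2.0f / 9.0f,    2.0f / 9.0f,   -2.0f / 9.0f   },
    {  1.0f / 90.0f,   2.0f / 90.0f,   4.0f / 90.0f,   8.0f / 90.0f,  16.0f / 90.0f  },
    {  1.0f / 90.0f,  -2.0f / 90.0f,   4.0f / 90.0f,  -8.0f / 90.0f,  16.0f / 90.0f  },
    { 16.0f / 180.0f,  8.0f / 180.0f,  4.0f / 180.0f,  2.0f / 180.0f,  1.0f / 180.0f },
    { 16.0f / 180.0f, -8.0f / 180.0f,  4.0f / 180.0f, -2.0f / 180.0f,  1.0f / 180.0f },
    {  0.0f,           0.0f,           0.0f,           0.0f,           1.0f          },
};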
-
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
@@ -804,10 +221,12 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the source tensor
*/
__kernel void winograd_filter_transform_4x4_5x5_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
@@ -1057,9 +476,12 @@ __kernel void winograd_filter_transform_4x4_5x5_nhwc(
*(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
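
The reason SRC_DIM_Z can move to a runtime argument is visible at the bottom of every kernel above: it is only used to unpack the flattened global Z id into a (filter, channel) pair, a plain integer division and modulo. A trivial restatement in C (helper name is ours):

/* Split the flattened work-item Z id, as done via x0/y0 in the kernels. */
static inline void unpack_z(int z, int src_dim_z, int *filter, int *channel)
{
    *filter  = z / src_dim_z; /* x0: output-feature-map (filter) index */
    *channel = z % src_dim_z; /* y0: input-channel index */
}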
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
@@ -1082,10 +504,12 @@ __kernel void winograd_filter_transform_4x4_5x5_nhwc(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the source tensor
*/
__kernel void winograd_filter_transform_2x2_7x7_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
@@ -1357,159 +781,13 @@ __kernel void winograd_filter_transform_2x2_7x7_nhwc(
*(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
-#endif // defined(SRC_DIM_Z)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
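
In the HORIZONTAL and VERTICAL builds only the out0 row of each transform survives, i.e. the 1-D transform of a 3-, 5- or 7-tap filter. A self-contained C restatement of the six out0 coefficients stored by the 4x4_3x3 kernels' 3x1/1x3 path (function name is ours):

#include <stdio.h>

/* 1-D path: under WINOGRAD_FILTER_TRANSFORM_HORIZONTAL (or _VERTICAL) only
 * the out0 expressions are computed and stored, six coefficients per 3-tap
 * filter. This restates out0.s0..out0.s5 of the 4x4_3x3 kernels verbatim. */
static void filter_transform_1d_3tap(const float w[3], float out[6])
{
    out[0] = w[0] / 16.0f;
    out[1] = (-w[0] - w[1] - w[2]) / 24.0f;
    out[2] = (-w[0] + w[1] - w[2]) / 24.0f;
    out[3] = (w[0] + 2.0f * w[1] + 4.0f * w[2]) / 96.0f;
    out[4] = (w[0] - 2.0f * w[1] + 4.0f * w[2]) / 96.0f;
    out[5] = w[2] / 4.0f;
}

int main(void)
{
    const float w[3] = { 1.0f, 2.0f, 3.0f };
    float out[6];
    filter_transform_1d_3tap(w, out);
    for (int i = 0; i < 6; ++i)
        printf("out[%d] = %f\n", i, out[i]);
    return 0;
}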
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_5x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC)
/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
@@ -1531,10 +809,12 @@ __kernel void winograd_filter_transform_4x1_5x1_nchw(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the source tensor
*/
__kernel void winograd_filter_transform_4x1_3x1_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
src_stride_x,
@@ -1553,12 +833,14 @@ __kernel void winograd_filter_transform_4x1_3x1_nhwc(
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC)
+#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC)
/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
@@ -1580,10 +862,12 @@ __kernel void winograd_filter_transform_4x1_3x1_nhwc(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the source tensor
*/
__kernel void winograd_filter_transform_4x1_5x1_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
src_stride_x,
@@ -1602,12 +886,14 @@ __kernel void winograd_filter_transform_4x1_5x1_nhwc(
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC)
+#if defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC)
/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
*
@@ -1629,10 +915,12 @@ __kernel void winograd_filter_transform_4x1_5x1_nhwc(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the source tensor
*/
__kernel void winograd_filter_transform_2x1_7x1_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
src_stride_x,
@@ -1651,161 +939,16 @@ __kernel void winograd_filter_transform_2x1_7x1_nhwc(
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC)
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x2_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
@@ -1827,10 +970,12 @@ __kernel void winograd_filter_transform_1x4_1x5_nchw(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
*/
__kernel void winograd_filter_transform_1x4_1x3_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
src_stride_x,
@@ -1849,12 +994,14 @@ __kernel void winograd_filter_transform_1x4_1x3_nhwc(
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC)
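+// Illustrative build options for the kernel above, collected from the @note entries in
+// its documentation (values are examples only):
+//   -DWINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC -DWINOGRAD_FILTER_TRANSFORM_VERTICAL -DDATA_TYPE=float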
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
@@ -1876,10 +1023,12 @@ __kernel void winograd_filter_transform_1x4_1x3_nhwc(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
*/
__kernel void winograd_filter_transform_1x4_1x5_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
src_stride_x,
@@ -1898,12 +1047,14 @@ __kernel void winograd_filter_transform_1x4_1x5_nhwc(
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC)
+#if defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
*
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
*
@@ -1925,10 +1076,12 @@ __kernel void winograd_filter_transform_1x4_1x5_nhwc(
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor
*/
__kernel void winograd_filter_transform_1x2_1x7_nhwc(
TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ const int SRC_DIM_Z)
{
winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
src_stride_x,
@@ -1947,6 +1100,8 @@ __kernel void winograd_filter_transform_1x2_1x7_nhwc(
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ SRC_DIM_Z);
}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC)
#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
new file mode 100644
index 0000000000..7341336b92
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - (DATA_TYPE)4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - (DATA_TYPE)4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = (DATA_TYPE)2.5f * tmp.s3; \
+ comm_fact.s3 = (DATA_TYPE)0.5f * tmp.s1 + (DATA_TYPE)2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = (DATA_TYPE)0.25f * tmp.s2 - (DATA_TYPE)1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = (DATA_TYPE)4.f * tmp.s2 + tmp.s6 - (DATA_TYPE)5.f * tmp.s4; \
+ comm_fact.s6 = (DATA_TYPE)2.f * tmp.s1 + (DATA_TYPE)0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + (DATA_TYPE)5.25f * tmp.s4 - (DATA_TYPE)5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + (DATA_TYPE)5.25f * tmp.s3 - (DATA_TYPE)5.25f * tmp.s5; \
+ })
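+
+// A minimal usage sketch for OUTPUT_ROW_4x4_5x5 (the names here are illustrative and do
+// not appear in the kernels below): the macro overwrites 'out' with the Winograd-transformed
+// version of one 8-element row 'tmp', using 'comm_fact' as scratch.
+//
+//   VEC_DATA_TYPE(DATA_TYPE, 8) tmp = ...;      // one row of the 8x8 input tile
+//   VEC_DATA_TYPE(DATA_TYPE, 8) out, comm_fact; // destination row and scratch
+//   OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact);    // out = row-wise B^T combination of tmp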
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = (DATA_TYPE)36.0f * tmp.s2 - (DATA_TYPE)13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = (DATA_TYPE)36.0f * tmp.s1 - (DATA_TYPE)13.0f * tmp.s3 + (DATA_TYPE)1.0f * tmp.s5; \
+ comm_fact.s2 = (DATA_TYPE)9.0f * tmp.s2 - (DATA_TYPE)10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = (DATA_TYPE)18.0f * tmp.s1 - (DATA_TYPE)20.0f * tmp.s3 + (DATA_TYPE)2.0f * tmp.s5; \
+ comm_fact.s4 = (DATA_TYPE)4.0f * tmp.s2 - (DATA_TYPE)5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = (DATA_TYPE)12.0f * tmp.s1 - (DATA_TYPE)15.0f * tmp.s3 + (DATA_TYPE)3.0f * tmp.s5; \
+ out.s0 = -(DATA_TYPE)36.0f * tmp.s0 + (DATA_TYPE)49.0f * tmp.s2 - (DATA_TYPE)14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -(DATA_TYPE)36.0f * tmp.s1 + (DATA_TYPE)0.0f * tmp.s2 + (DATA_TYPE)49.0f * tmp.s3 - (DATA_TYPE)14.0f * tmp.s5 + tmp.s7; \
+ })
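+
+// For reference, assuming the standard Winograd formulation, the coefficients in
+// OUTPUT_ROW_2x2_7x7 are the rows of the 8x8 input-transform matrix B^T for F(2x2, 7x7).
+// The first, second and last rows read off the code above are (the remaining rows follow
+// the same +/- pairing of the comm_fact terms):
+//   [ -36   0  49   0 -14   0   1   0 ]
+//   [   0 -36  36  13 -13  -1   1   0 ]
+//   [   0 -36   0  49   0 -14   0   1 ]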
+
+#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+#if defined(NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size is 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
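+// Illustrative build options for this kernel (values are examples only; N0 is assumed
+// here to be the vector width along the channel dimension, as used in the code below):
+//   -DNHWC -DWINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC -DDATA_TYPE=half -DN0=2
+//   -DPAD_LEFT=1 -DPAD_TOP=1 -DSRC_WIDTH=96 -DSRC_HEIGHT=64 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4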
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
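+
+ // mout enumerates output tiles in row-major order. For example, with _INUM_TILES_X = 3
+ // and OUTPUT_TILE_W = OUTPUT_TILE_H = 4, mout = 4 is tile (1, 1), so the loads below
+ // start from input coordinate (4 - PAD_LEFT, 4 - PAD_TOP).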
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 6, N0, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 6, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 6, N0, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v *= (DATA_TYPE)4.0f;
+ })
+
+ com[0].v = in[2].v - (DATA_TYPE)4.f * in[0].v;
+ com[1].v = in[3].v - (DATA_TYPE)4.f * in[1].v;
+ com[2].v = in[4].v - (DATA_TYPE)4.f * in[2].v;
+ com[3].v = in[5].v - (DATA_TYPE)4.f * in[3].v;
+ com[4].v = in[3].v - in[1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4].v - in[2].v;
+
+ out[0].v = com[2].v - com[0].v;
+ out[1].v = com[2].v + com[1].v;
+ out[2].v = com[2].v - com[1].v;
+ out[3].v = com[5].v + com[4].v;
+ out[4].v = com[5].v - com[4].v;
+ out[5].v = com[3].v - com[1].v;
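+
+ // Assuming the standard Winograd F(4, 3) formulation, and given the x4 prescale of
+ // 'in' above, the six outputs equal 4 * (B^T * d) with
+ //   B^T = [ 4  0 -5  0  1  0 ]
+ //         [ 0 -4 -4  1  1  0 ]
+ //         [ 0  4 -4 -1  1  0 ]
+ //         [ 0 -2 -1  2  1  0 ]
+ //         [ 0  2 -1 -2  1  0 ]
+ //         [ 0  4  0 -5  0  1 ]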
+
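+ // Each of the 6 transformed values goes to its own plane of the destination: the row
+ // index is mout + i * (tiles per image), offset by bout * 6 * (tiles per image).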
+ TILE(uint, 6, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 36, N0, in);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 6, 6, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 6, N0, com);
+ TILE(DATA_TYPE, 36, N0, tmp);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
+ com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
+ com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
+ com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
+ com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
+ tmp[i + 0 * 6].v = com[2].v - com[0].v;
+ tmp[i + 1 * 6].v = com[2].v + com[1].v;
+ tmp[i + 2 * 6].v = com[2].v - com[1].v;
+ tmp[i + 3 * 6].v = com[5].v + com[4].v;
+ tmp[i + 4 * 6].v = com[5].v - com[4].v;
+ tmp[i + 5 * 6].v = com[3].v - com[1].v;
+ })
+
+ TILE(DATA_TYPE, 36, N0, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = tmp[i * 6 + 2].v - (DATA_TYPE)4.f * tmp[i * 6 + 0].v;
+ com[1].v = tmp[i * 6 + 3].v - (DATA_TYPE)4.f * tmp[i * 6 + 1].v;
+ com[2].v = tmp[i * 6 + 4].v - (DATA_TYPE)4.f * tmp[i * 6 + 2].v;
+ com[3].v = tmp[i * 6 + 5].v - (DATA_TYPE)4.f * tmp[i * 6 + 3].v;
+ com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
+ out[i * 6 + 0].v = com[2].v - com[0].v;
+ out[i * 6 + 1].v = com[2].v + com[1].v;
+ out[i * 6 + 2].v = com[2].v - com[1].v;
+ out[i * 6 + 3].v = com[5].v + com[4].v;
+ out[i * 6 + 4].v = com[5].v - com[4].v;
+ out[i * 6 + 5].v = com[3].v - com[1].v;
+ })
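+
+ // The two unrolled loops above exploit the separability of the 2-D transform: the
+ // first applies the 1-D combination down each of the 6 columns (tmp = B^T * d) and
+ // the second applies it along each row (out = tmp * B), assuming the same B as in the
+ // 1-D branch, so the full product B^T * d * B needs no general matrix multiply.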
+
+ // Compute destination address
+ TILE(uint, 36, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5, 5x1 or 1x5, the output tile is 4x4, 4x1 or 1x4 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, N0, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 1, 8, com);
+
+ com[0].s[0] = in[2].v - (DATA_TYPE)4.25f * in[4].v + in[6].v;
+ com[0].s[1] = in[1].v - (DATA_TYPE)4.25f * in[3].v + in[5].v;
+ com[0].s[2] = (DATA_TYPE)0.5f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
+ com[0].s[3] = (DATA_TYPE)0.25f * in[2].v - (DATA_TYPE)1.25f * in[4].v + in[6].v;
+ com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
+ com[0].s[5] = (DATA_TYPE)2.0f * in[1].v - (DATA_TYPE)2.5f * in[3].v + (DATA_TYPE)0.5f * in[5].v;
+ out[0].s[0] = in[0].v - (DATA_TYPE)5.25f * in[2].v + (DATA_TYPE)5.25f * in[4].v - in[6].v;
+ out[1].s[0] = com[0].s[0] + com[0].s[1];
+ out[2].s[0] = com[0].s[0] - com[0].s[1];
+ out[3].s[0] = com[0].s[3] + com[0].s[2];
+ out[4].s[0] = com[0].s[3] - com[0].s[2];
+ out[5].s[0] = com[0].s[4] + com[0].s[5];
+ out[6].s[0] = com[0].s[4] - com[0].s[5];
+ out[7].s[0] = -in[1].v + (DATA_TYPE)5.25f * in[3].v - (DATA_TYPE)5.25f * in[5].v + in[7].v;
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+ com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
+ com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+ com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
+ com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
+ com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v + com[1].v;
+ tmp[2].v = com[0].v - com[1].v;
+ tmp[3].v = com[2].v + com[3].v;
+ tmp[4].v = com[2].v - com[3].v;
+ tmp[5].v = com[4].v + com[5].v;
+ tmp[6].v = com[4].v - com[5].v;
+ tmp[7].v = com[7].v;
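+
+ // The column pass above uses the same coefficients as the OUTPUT_ROW_4x4_5x5 macro at
+ // the top of this file; it is spelled out here because each column lives in a separate
+ // 'com' row rather than in one 8-wide vector.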
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = tmp[i].s[2] - (DATA_TYPE)4.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = tmp[i].s[1] - (DATA_TYPE)4.25f * tmp[i].s[3] + tmp[i].s[5];
+ com[0].s[2] = (DATA_TYPE)0.5f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
+ com[0].s[3] = (DATA_TYPE)0.25f * tmp[i].s[2] - (DATA_TYPE)1.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = (DATA_TYPE)2.0f * tmp[i].s[1] - (DATA_TYPE)2.5f * tmp[i].s[3] + (DATA_TYPE)0.5f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = tmp[i].s[0] - (DATA_TYPE)5.25f * tmp[i].s[2] + (DATA_TYPE)5.25f * tmp[i].s[4] - tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
+ out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
+ out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 7].s[0] = -tmp[i].s[1] + (DATA_TYPE)5.25f * tmp[i].s[3] - (DATA_TYPE)5.25f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7, 7x1 or 1x7, the output tile is 2x2, 2x1 or 1x2 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v *= (DATA_TYPE)-36.0f;
+ })
+
+ TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
+
+ com[0].s[0] = (DATA_TYPE)36.0f * in[2].v - (DATA_TYPE)13.0f * in[4].v + in[6].v;
+ com[0].s[1] = (DATA_TYPE)36.0f * in[1].v - (DATA_TYPE)13.0f * in[3].v + (DATA_TYPE)1.0f * in[5].v;
+ com[0].s[2] = (DATA_TYPE)9.0f * in[2].v - (DATA_TYPE)10.0f * in[4].v + in[6].v;
+ com[0].s[3] = (DATA_TYPE)18.0f * in[1].v - (DATA_TYPE)20.0f * in[3].v + (DATA_TYPE)2.0f * in[5].v;
+ com[0].s[4] = (DATA_TYPE)4.0f * in[2].v - (DATA_TYPE)5.0f * in[4].v + in[6].v;
+ com[0].s[5] = (DATA_TYPE)12.0f * in[1].v - (DATA_TYPE)15.0f * in[3].v + (DATA_TYPE)3.0f * in[5].v;
+ out[0].s[0] = -(DATA_TYPE)36.0f * in[0].v + (DATA_TYPE)49.0f * in[2].v - (DATA_TYPE)14.0f * in[4].v + in[6].v;
+ out[1].s[0] = com[0].s[0] - com[0].s[1];
+ out[2].s[0] = com[0].s[0] + com[0].s[1];
+ out[3].s[0] = com[0].s[2] - com[0].s[3];
+ out[4].s[0] = com[0].s[2] + com[0].s[3];
+ out[5].s[0] = com[0].s[4] - com[0].s[5];
+ out[6].s[0] = com[0].s[4] + com[0].s[5];
+ out[7].s[0] = -(DATA_TYPE)36.0f * in[1].v + (DATA_TYPE)0.0f * in[2].v + (DATA_TYPE)49.0f * in[3].v - (DATA_TYPE)14.0f * in[5].v + in[7].v;
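+
+ // Given the x(-36) prescale of 'in' above, the eight outputs equal -36 * (B^T * d),
+ // with the same row coefficients as the OUTPUT_ROW_2x2_7x7 macro at the top of this file.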
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
+ com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
+ com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
+ com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v - com[1].v;
+ tmp[2].v = com[0].v + com[1].v;
+ tmp[3].v = com[2].v - com[3].v;
+ tmp[4].v = com[2].v + com[3].v;
+ tmp[5].v = com[4].v - com[5].v;
+ tmp[6].v = com[4].v + com[5].v;
+ tmp[7].v = com[7].v;
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = (DATA_TYPE)36.0f * tmp[i].s[2] - (DATA_TYPE)13.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = (DATA_TYPE)36.0f * tmp[i].s[1] - (DATA_TYPE)13.0f * tmp[i].s[3] + (DATA_TYPE)1.0f * tmp[i].s[5];
+ com[0].s[2] = (DATA_TYPE)9.0f * tmp[i].s[2] - (DATA_TYPE)10.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[3] = (DATA_TYPE)18.0f * tmp[i].s[1] - (DATA_TYPE)20.0f * tmp[i].s[3] + (DATA_TYPE)2.0f * tmp[i].s[5];
+ com[0].s[4] = (DATA_TYPE)4.0f * tmp[i].s[2] - (DATA_TYPE)5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = (DATA_TYPE)12.0f * tmp[i].s[1] - (DATA_TYPE)15.0f * tmp[i].s[3] + (DATA_TYPE)3.0f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[0] + (DATA_TYPE)49.0f * tmp[i].s[2] - (DATA_TYPE)14.0f * tmp[i].s[4] + tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
+ out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
+ out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 7].s[0] = -(DATA_TYPE)36.0f * tmp[i].s[1] + (DATA_TYPE)0.0f * tmp[i].s[2] + (DATA_TYPE)49.0f * tmp[i].s[3] - (DATA_TYPE)14.0f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
+ dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
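+
+// The remaining 1-D variants in this file are thin wrappers like the one above: they
+// forward their arguments unchanged to the corresponding 2-D kernel, whose body reduces
+// to a single row or column pass when WINOGRAD_INPUT_TRANSFORM_HORIZONTAL or
+// WINOGRAD_INPUT_TRANSFORM_VERTICAL is defined.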
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The source tensor's width
+ * @param[in] _ISRC_HEIGHT The source tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The source tensor's width
+ * @param[in] _ISRC_HEIGHT The source tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+#endif // defined(NHWC)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
new file mode 100644
index 0000000000..9eb995fbb2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -0,0 +1,1109 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
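+ * @note As an illustrative (not exhaustive) example, a plausible set of build options for the 2x2/7x7 case could be:
+ *       -DWINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC -DVEC_SIZE=2 -DNUM_TILES_X=16 -DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=2 -DDATA_TYPE=float -DN0=2
+ *       plus the activation defines (ACTIVATION_TYPE, A_VAL, B_VAL) consumed by T_ACTIVATION below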
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] _ISRC_HEIGHT The source tensor's height
+ * @param[in] _IDST_WIDTH The destination tensor's width
+ * @param[in] _IDST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int _ISRC_HEIGHT,
+ const int _IDST_WIDTH,
+ const int _IDST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
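+
+ // mout indexes the Winograd tiles in raster order: e.g. with NUM_TILES_X = 16 and a 2x2 output tile,
+ // mout = 17 maps to tile (1, 1), i.e. x_out = 2 and y_out = 2 in the destination tensor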
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 2, N0, out);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
+ })
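+
+ // In other words, the transformed input stacks the 8 Winograd components along Y: component i of
+ // output tile mout is expected at Y = mout + i * _ISRC_HEIGHT, with batches _ISRC_HEIGHT * 8 apart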
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 8 channels to compose the 8x1 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out00 and out01
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
+ out[1].v = -in[1].v + in[2].v - (DATA_TYPE)2.0f * in[3].v + (DATA_TYPE)2.0f * in[4].v - (DATA_TYPE)3.0f * in[5].v + (DATA_TYPE)3.0f * in[6].v + in[7].v;
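+
+ // The coefficients above correspond to the two rows of the F(2, 7) output-transform matrix, i.e.
+ // A^T = [ 1  1  1  1  1  1  1  0 ]
+ //       [ 0 -1  1 -2  2 -3  3  1 ]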
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 2, 1, dst_indirect_y);
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 16, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 64 channels to compose the 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
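+ // First pass: apply the 1D transform down the columns; tmp holds the resulting 2x8 intermediate,
+ // interleaved so that tmp[2 * col + row] is row "row" of column "col"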
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
+ tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - (DATA_TYPE)2 * in[24 + i].v + (DATA_TYPE)2 * in[32 + i].v - (DATA_TYPE)3 * in[40 + i].v + (DATA_TYPE)3 * in[48 + i].v + in[56 + i].v;
+ })
+
+ // Compute the 2x2 output tile
+ LOOP_UNROLLING(int, i, 0, 1, 2,
+ {
+ out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
+ out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - (DATA_TYPE)2 * tmp[6 + i].v + (DATA_TYPE)2 * tmp[8 + i].v - (DATA_TYPE)3 * tmp[10 + i].v + (DATA_TYPE)3 * tmp[12 + i].v + tmp[14 + i].v;
+ })
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
+ dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(uint, 6, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 6 channels to compose the 6x1 or 1x6 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out00, out01, out02 and out03
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
+ out[1].v = in[1].v - in[2].v + (DATA_TYPE)2.0f * in[3].v - (DATA_TYPE)2.0f * in[4].v;
+ out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * in[3].v + (DATA_TYPE)4.0f * in[4].v;
+ out[3].v = in[1].v - in[2].v + (DATA_TYPE)8.0f * in[3].v - (DATA_TYPE)8.0f * in[4].v + in[5].v;
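+
+ // The coefficients above correspond to the four rows of the F(4, 3) output-transform matrix, i.e.
+ // A^T = [ 1  1  1  1  1  0 ]
+ //       [ 0  1 -1  2 -2  0 ]
+ //       [ 0  1  1  4  4  0 ]
+ //       [ 0  1 -1  8 -8  1 ]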
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 36, N0, in);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 36, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 36 channels to compose the 6x6 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
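+ // First pass: compute A^T * in column by column; the 4x6 intermediate overwrites
+ // rows 0..3 of "in" (i.e. in[i], in[6 + i], in[12 + i] and in[18 + i])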
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ tmp[0].v = in[6 + i].v + in[12 + i].v;
+ tmp[1].v = in[6 + i].v - in[12 + i].v;
+ tmp[2].v = in[18 + i].v + in[24 + i].v;
+ tmp[3].v = in[18 + i].v - in[24 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ in[i].v = in[i].v + tmp[0].v + tmp[2].v;
+ in[6 + i].v = tmp[3].v + tmp[1].v;
+ in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
+ })
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
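+ // Second pass: multiply the 4x6 intermediate by A on the right, row by row, to produce the 4x4 tile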
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
+ tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
+ tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
+ tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
+ out[4 * i + 1].v = tmp[3].v + tmp[1].v;
+ out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
+ })
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+#if defined(IS_BATCHED)
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+#else // defined(IS_BATCHED)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(IS_BATCHED)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" contains 1x8 or 8x1 tile here
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in; in this 1D case the output consists of a single column/row
+ tmp[0].v = in[1].v - in[2].v;
+ tmp[1].v = (DATA_TYPE)2.0f * (in[3].v - in[4].v);
+ tmp[2].v = (DATA_TYPE)2.0f * (in[5].v + in[6].v);
+ tmp[3].v = in[3].v + in[4].v;
+ out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + (DATA_TYPE)4.0f * tmp[2].v;
+ out[1].v = tmp[0].v + tmp[1].v + (DATA_TYPE)4.0f * (in[5].v - in[6].v);
+ out[2].v = in[1].v + in[2].v + (DATA_TYPE)4.0f * tmp[3].v + tmp[2].v;
+ out[3].v = tmp[0].v + (DATA_TYPE)4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
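+
+ // The coefficients above correspond to the four rows of the F(4, 5) output-transform matrix, i.e.
+ // A^T = [ 1  1  1  1  1  8  8  0 ]
+ //       [ 0  1 -1  2 -2  4 -4  0 ]
+ //       [ 0  1  1  4  4  2  2  0 ]
+ //       [ 0  1 -1  8 -8  1 -1  1 ]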
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 6, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" here is 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[0].v = in[8 + i].v + in[16 + i].v;
+ tmp[1].v = in[8 + i].v - in[16 + i].v;
+ tmp[2].v = in[24 + i].v + in[32 + i].v;
+ tmp[3].v = in[24 + i].v - in[32 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[40 + i].v + in[48 + i].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[40 + i].v - in[48 + i].v;
+
+ // 4x8 matrix as a result
+ in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
+ in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
+ })
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ // in * A, with in = A^T * in as above
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
+ tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
+ tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
+ tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
+
+ // 4x4 tile
+ out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
+ out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
+ })
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor is passed at run time via the DST_WIDTH kernel argument
+ * @note The height of the output tensor is passed at run time via the DST_HEIGHT kernel argument
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor is passed at run time via the DST_WIDTH kernel argument
+ * @note The height of the output tensor is passed at run time via the DST_HEIGHT kernel argument
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor is passed at run time via the DST_WIDTH kernel argument
+ * @note The height of the output tensor is passed at run time via the DST_HEIGHT kernel argument
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The width of the output tensor is passed at run time via the DST_WIDTH kernel argument
+ * @note The height of the output tensor is passed at run time via the DST_HEIGHT kernel argument
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor is passed at run time via the DST_WIDTH kernel argument
+ * @note The height of the output tensor is passed at run time via the DST_HEIGHT kernel argument
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ * @param[in] SRC_HEIGHT The source tensor's height
+ * @param[in] DST_WIDTH The destination tensor's width
+ * @param[in] DST_HEIGHT The destination tensor's height
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size,
+ const int SRC_HEIGHT,
+ const int DST_WIDTH,
+ const int DST_HEIGHT)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size,
+ SRC_HEIGHT,
+ DST_WIDTH,
+ DST_HEIGHT);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
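[Editor's note] The two vertical kernels above are thin wrappers: -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL re-routes the 1x4 tile variants onto the existing 4x4 horizontal implementations, so the behaviour is selected entirely by build options. As a sketch of what that implies on the host side (the option values, helper name and error handling are illustrative assumptions, not library code):

/* Hedged host-side sketch: compiling the vertical 1x4/1x3 output-transform
 * kernel with the compile-time definitions named in the notes above.
 * The values chosen for the -D options are examples only. */
#include <CL/cl.h>
#include <stddef.h>

cl_kernel build_vertical_output_transform(cl_program program, cl_device_id device)
{
    const char *options =
        "-DNUM_TILES_X=6 -DOUTPUT_TILE_W=1 -DOUTPUT_TILE_H=4 "
        "-DDST_WIDTH=24 -DDST_HEIGHT=32 -DVEC_SIZE=4 -DDATA_TYPE=float "
        "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL "
        "-DWINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC";

    if (clBuildProgram(program, 1, &device, options, NULL, NULL) != CL_SUCCESS)
        return NULL; /* a real caller would query the build log here */

    cl_int err = CL_SUCCESS;
    return clCreateKernel(program, "winograd_output_transform_1x4_1x3_nhwc", &err);
}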
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
deleted file mode 100644
index 93c5024c52..0000000000
--- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "non_linear_filter_helpers.h"
-
-/** This function applies a non-linear filter on a 3x3 box basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_box3x3(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- uchar16 top = vload16(0, offset(&src, -1, -1));
- uchar16 middle = vload16(0, offset(&src, -1, 0));
- uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
- // Apply respective filter
-#ifdef MIN
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = row_reduce_min_3(tmp);
-#elif defined(MAX)
- uchar16 tmp = max(top, max(middle, bottom));
- uchar8 out = row_reduce_max_3(tmp);
-#elif defined(MEDIAN)
- uchar8 p0 = top.s01234567;
- uchar8 p1 = top.s12345678;
- uchar8 p2 = top.s23456789;
- uchar8 p3 = middle.s01234567;
- uchar8 p4 = middle.s12345678;
- uchar8 p5 = middle.s23456789;
- uchar8 p6 = bottom.s01234567;
- uchar8 p7 = bottom.s12345678;
- uchar8 p8 = bottom.s23456789;
- uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
- // Store result
- vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non-linear filter on a 3x3 cross basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_cross3x3(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- uchar8 top = vload8(0, offset(&src, 0, -1));
- uchar16 middle = vload16(0, offset(&src, -1, 0));
- uchar8 bottom = vload8(0, offset(&src, 0, 1));
-
- // Apply respective filter
-#ifdef MIN
- uchar8 tmp_middle = row_reduce_min_3(middle);
- uchar8 out = min(tmp_middle, min(top, bottom));
-#elif defined(MAX)
- uchar8 tmp_middle = row_reduce_max_3(middle);
- uchar8 out = max(tmp_middle, max(top, bottom));
-#elif defined(MEDIAN)
- uchar8 p0 = top.s01234567;
- uchar8 p1 = middle.s01234567;
- uchar8 p2 = middle.s12345678;
- uchar8 p3 = middle.s23456789;
- uchar8 p4 = bottom.s01234567;
- uchar8 out = sort5(p0, p1, p2, p3, p4);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
- // Store result
- vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non-linear filter on a 3x3 disk basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_disk3x3(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- uchar16 top = vload16(0, offset(&src, -1, -1));
- uchar16 middle = vload16(0, offset(&src, -1, 0));
- uchar16 bottom = vload16(0, offset(&src, -1, 1));
-
- // Apply respective filter
-#ifdef MIN
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = row_reduce_min_3(tmp);
-#elif defined(MAX)
- uchar16 tmp = max(top, max(middle, bottom));
- uchar8 out = row_reduce_max_3(tmp);
-#elif defined(MEDIAN)
- uchar8 p0 = top.s01234567;
- uchar8 p1 = top.s12345678;
- uchar8 p2 = top.s23456789;
- uchar8 p3 = middle.s01234567;
- uchar8 p4 = middle.s12345678;
- uchar8 p5 = middle.s23456789;
- uchar8 p6 = bottom.s01234567;
- uchar8 p7 = bottom.s12345678;
- uchar8 p8 = bottom.s23456789;
- uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
- // Store result
- vstore8(out, 0, dst.ptr);
-}
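[Editor's note] To make the MEDIAN path concrete: per output pixel the 3x3 box kernel computes the median of nine neighbours, which sort9 does for 8 pixels at once with a sorting network. A scalar reference in plain C (median3x3 is a hypothetical helper with no border handling):

#include <stdlib.h>
#include <string.h>

static int cmp_uchar(const void *a, const void *b)
{
    return (int)(*(const unsigned char *)a) - (int)(*(const unsigned char *)b);
}

/* Median of the 3x3 neighbourhood around (x, y); the caller must keep the
 * window inside the image, as the CL kernel does via its border handling. */
unsigned char median3x3(const unsigned char *img, int stride, int x, int y)
{
    unsigned char window[9];
    for (int ky = -1; ky <= 1; ++ky) /* gather the three rows of the box */
        memcpy(&window[(ky + 1) * 3], &img[(size_t)(y + ky) * stride + (x - 1)], 3);
    qsort(window, 9, 1, cmp_uchar);  /* any sort works for a reference */
    return window[4];                /* the middle element is the median */
}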
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
deleted file mode 100644
index 7c87284a72..0000000000
--- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "non_linear_filter_helpers.h"
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-
-/** Sorting network that, for 8 adjacent pixels at once, sorts the values of a disk of diameter 5 and returns the median.
- *
- * @param[in] top2 Values of elements two rows above.
- * @param[in] top Values of elements one row above.
- * @param[in] middle Values of middle elements.
- * @param[in] bottom Values of elements one row below.
- * @param[in] bottom2 Values of elements two rows below.
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
-{
- uchar8 p0 = top2.s01234567;
- uchar8 p1 = top2.s12345678;
- uchar8 p2 = top2.s23456789;
- uchar8 p3 = top.s01234567;
- uchar8 p4 = top.s12345678;
- uchar8 p5 = top.s23456789;
- uchar8 p6 = top.s3456789A;
- uchar8 p7 = top.s456789AB;
- uchar8 p8 = middle.s01234567;
- uchar8 p9 = middle.s12345678;
- uchar8 p10 = middle.s23456789;
- uchar8 p11 = middle.s3456789A;
- uchar8 p12 = middle.s456789AB;
- uchar8 p13 = bottom.s01234567;
- uchar8 p14 = bottom.s12345678;
- uchar8 p15 = bottom.s23456789;
- uchar8 p16 = bottom.s3456789A;
- uchar8 p17 = bottom.s456789AB;
- uchar8 p18 = bottom2.s01234567;
- uchar8 p19 = bottom2.s12345678;
- uchar8 p20 = bottom2.s23456789;
-
- SORT(p0, p1);
- SORT(p2, p3);
- SORT(p4, p5);
- SORT(p6, p7);
- SORT(p8, p9);
- SORT(p10, p11);
- SORT(p12, p13);
- SORT(p14, p15);
- SORT(p16, p17);
- SORT(p18, p19);
- SORT(p0, p2);
- SORT(p1, p3);
- SORT(p4, p6);
- SORT(p5, p7);
- SORT(p8, p10);
- SORT(p9, p11);
- SORT(p12, p14);
- SORT(p13, p15);
- SORT(p16, p18);
- SORT(p17, p19);
- SORT(p1, p2);
- SORT(p5, p6);
- SORT(p0, p4);
- SORT(p3, p7);
- SORT(p9, p10);
- SORT(p13, p14);
- SORT(p8, p12);
- SORT(p11, p15);
- SORT(p17, p18);
- SORT(p16, p20);
- SORT(p1, p5);
- SORT(p2, p6);
- SORT(p9, p13);
- SORT(p10, p14);
- SORT(p0, p8);
- SORT(p7, p15);
- SORT(p17, p20);
- SORT(p1, p4);
- SORT(p3, p6);
- SORT(p9, p12);
- SORT(p11, p14);
- SORT(p18, p20);
- SORT(p0, p16);
- SORT(p2, p4);
- SORT(p3, p5);
- SORT(p10, p12);
- SORT(p11, p13);
- SORT(p1, p9);
- SORT(p6, p14);
- SORT(p19, p20);
- SORT(p3, p4);
- SORT(p11, p12);
- SORT(p1, p8);
- SORT(p2, p10);
- SORT(p5, p13);
- SORT(p7, p14);
- SORT(p3, p11);
- SORT(p2, p8);
- SORT(p4, p12);
- SORT(p7, p13);
- SORT(p1, p17);
- SORT(p3, p10);
- SORT(p5, p12);
- SORT(p1, p16);
- SORT(p2, p18);
- SORT(p3, p9);
- SORT(p6, p12);
- SORT(p2, p16);
- SORT(p3, p8);
- SORT(p7, p12);
- SORT(p5, p9);
- SORT(p6, p10);
- SORT(p4, p8);
- SORT(p7, p11);
- SORT(p3, p19);
- SORT(p5, p8);
- SORT(p7, p10);
- SORT(p3, p18);
- SORT(p4, p20);
- SORT(p6, p8);
- SORT(p7, p9);
- SORT(p3, p17);
- SORT(p5, p20);
- SORT(p7, p8);
- SORT(p3, p16);
- SORT(p6, p20);
- SORT(p5, p17);
- SORT(p7, p20);
- SORT(p4, p16);
- SORT(p6, p18);
- SORT(p5, p16);
- SORT(p7, p19);
- SORT(p7, p18);
- SORT(p6, p16);
- SORT(p7, p17);
- SORT(p10, p18);
- SORT(p7, p16);
- SORT(p9, p17);
- SORT(p8, p16);
- SORT(p9, p16);
- SORT(p10, p16);
-
- return p10;
-}
-
-/** Sorting network that, for 8 adjacent pixels at once, sorts the values of a 5x5 box and returns the median.
- *
- * @param[in] top2 Values of elements two rows above.
- * @param[in] top Values of elements one row above.
- * @param[in] middle Values of middle elements.
- * @param[in] bottom Values of elements one row below.
- * @param[in] bottom2 Values of elements two rows below.
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
-{
- uchar8 p0 = top2.s01234567;
- uchar8 p1 = top2.s12345678;
- uchar8 p2 = top2.s23456789;
- uchar8 p3 = top2.s3456789A;
- uchar8 p4 = top2.s456789AB;
- uchar8 p5 = top.s01234567;
- uchar8 p6 = top.s12345678;
- uchar8 p7 = top.s23456789;
- uchar8 p8 = top.s3456789A;
- uchar8 p9 = top.s456789AB;
- uchar8 p10 = middle.s01234567;
- uchar8 p11 = middle.s12345678;
- uchar8 p12 = middle.s23456789;
- uchar8 p13 = middle.s3456789A;
- uchar8 p14 = middle.s456789AB;
- uchar8 p15 = bottom.s01234567;
- uchar8 p16 = bottom.s12345678;
- uchar8 p17 = bottom.s23456789;
- uchar8 p18 = bottom.s3456789A;
- uchar8 p19 = bottom.s456789AB;
- uchar8 p20 = bottom2.s01234567;
- uchar8 p21 = bottom2.s12345678;
- uchar8 p22 = bottom2.s23456789;
- uchar8 p23 = bottom2.s3456789A;
- uchar8 p24 = bottom2.s456789AB;
-
- SORT(p1, p2);
- SORT(p0, p1);
- SORT(p1, p2);
- SORT(p4, p5);
- SORT(p3, p4);
- SORT(p4, p5);
- SORT(p0, p3);
- SORT(p2, p5);
- SORT(p2, p3);
- SORT(p1, p4);
- SORT(p1, p2);
- SORT(p3, p4);
- SORT(p7, p8);
- SORT(p6, p7);
- SORT(p7, p8);
- SORT(p10, p11);
- SORT(p9, p10);
- SORT(p10, p11);
- SORT(p6, p9);
- SORT(p8, p11);
- SORT(p8, p9);
- SORT(p7, p10);
- SORT(p7, p8);
- SORT(p9, p10);
- SORT(p0, p6);
- SORT(p4, p10);
- SORT(p4, p6);
- SORT(p2, p8);
- SORT(p2, p4);
- SORT(p6, p8);
- SORT(p1, p7);
- SORT(p5, p11);
- SORT(p5, p7);
- SORT(p3, p9);
- SORT(p3, p5);
- SORT(p7, p9);
- SORT(p1, p2);
- SORT(p3, p4);
- SORT(p5, p6);
- SORT(p7, p8);
- SORT(p9, p10);
- SORT(p13, p14);
- SORT(p12, p13);
- SORT(p13, p14);
- SORT(p16, p17);
- SORT(p15, p16);
- SORT(p16, p17);
- SORT(p12, p15);
- SORT(p14, p17);
- SORT(p14, p15);
- SORT(p13, p16);
- SORT(p13, p14);
- SORT(p15, p16);
- SORT(p19, p20);
- SORT(p18, p19);
- SORT(p19, p20);
- SORT(p21, p22);
- SORT(p23, p24);
- SORT(p21, p23);
- SORT(p22, p24);
- SORT(p22, p23);
- SORT(p18, p21);
- SORT(p20, p23);
- SORT(p20, p21);
- SORT(p19, p22);
- SORT(p22, p24);
- SORT(p19, p20);
- SORT(p21, p22);
- SORT(p23, p24);
- SORT(p12, p18);
- SORT(p16, p22);
- SORT(p16, p18);
- SORT(p14, p20);
- SORT(p20, p24);
- SORT(p14, p16);
- SORT(p18, p20);
- SORT(p22, p24);
- SORT(p13, p19);
- SORT(p17, p23);
- SORT(p17, p19);
- SORT(p15, p21);
- SORT(p15, p17);
- SORT(p19, p21);
- SORT(p13, p14);
- SORT(p15, p16);
- SORT(p17, p18);
- SORT(p19, p20);
- SORT(p21, p22);
- SORT(p23, p24);
- SORT(p0, p12);
- SORT(p8, p20);
- SORT(p8, p12);
- SORT(p4, p16);
- SORT(p16, p24);
- SORT(p12, p16);
- SORT(p2, p14);
- SORT(p10, p22);
- SORT(p10, p14);
- SORT(p6, p18);
- SORT(p6, p10);
- SORT(p10, p12);
- SORT(p1, p13);
- SORT(p9, p21);
- SORT(p9, p13);
- SORT(p5, p17);
- SORT(p13, p17);
- SORT(p3, p15);
- SORT(p11, p23);
- SORT(p11, p15);
- SORT(p7, p19);
- SORT(p7, p11);
- SORT(p11, p13);
- SORT(p11, p12);
- return p12;
-}
-
-/** This function applies a non-linear filter on a 5x5 box basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_box5x5(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- uchar16 top2 = vload16(0, offset(&src, -2, -2));
- uchar16 top = vload16(0, offset(&src, -2, -1));
- uchar16 middle = vload16(0, offset(&src, -2, 0));
- uchar16 bottom = vload16(0, offset(&src, -2, 1));
- uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
-
- // Apply respective filter
-#ifdef MIN
- uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
- uchar8 out = row_reduce_min_5(tmp);
-#elif defined(MAX)
- uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
- uchar8 out = row_reduce_max_5(tmp);
-#elif defined(MEDIAN)
- uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
- // Store result
- vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non-linear filter on a 5x5 cross basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_cross5x5(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- uchar8 top2 = vload8(0, offset(&src, 0, -2));
- uchar8 top = vload8(0, offset(&src, 0, -1));
- uchar16 middle = vload16(0, offset(&src, -2, 0));
- uchar8 bottom = vload8(0, offset(&src, 0, 1));
- uchar8 bottom2 = vload8(0, offset(&src, 0, 2));
-
- // Apply respective filter
-#ifdef MIN
- uchar8 tmp_middle = row_reduce_min_5(middle);
- uchar8 out = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
-#elif defined(MAX)
- uchar8 tmp_middle = row_reduce_max_5(middle);
- uchar8 out = max(tmp_middle, max(max(top2, top), max(bottom, bottom2)));
-#elif defined(MEDIAN)
- uchar8 p0 = top2;
- uchar8 p1 = top;
- uchar8 p2 = middle.s01234567;
- uchar8 p3 = middle.s12345678;
- uchar8 p4 = middle.s23456789;
- uchar8 p5 = middle.s3456789A;
- uchar8 p6 = middle.s456789AB;
- uchar8 p7 = bottom;
- uchar8 p8 = bottom2;
- uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
- // Store result
- vstore8(out, 0, dst.ptr);
-}
-
-/** This function applies a non-linear filter on a 5x5 disk basis on an input image.
- *
- * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void non_linear_filter_disk5x5(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values
- uchar16 top2 = vload16(0, offset(&src, -2, -2));
- uchar16 top = vload16(0, offset(&src, -2, -1));
- uchar16 middle = vload16(0, offset(&src, -2, 0));
- uchar16 bottom = vload16(0, offset(&src, -2, 1));
- uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
-
- // Shift top2 and bottom2 values
- top2 = top2.s123456789ABCDEFF;
- bottom2 = bottom2.s123456789ABCDEFF;
-
- // Apply respective filter
-#ifdef MIN
- uchar16 tmp_3 = min(top2, bottom2);
- uchar16 tmp_5 = min(middle, min(top, bottom));
- uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
- uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
- uchar8 out = min(tmp_3_red, tmp_5_red);
-#elif defined(MAX)
- uchar16 tmp_3 = max(top2, bottom2);
- uchar16 tmp_5 = max(middle, max(top, bottom));
- uchar8 tmp_3_red = row_reduce_max_3(tmp_3);
- uchar8 tmp_5_red = row_reduce_max_5(tmp_5);
- uchar8 out = max(tmp_3_red, tmp_5_red);
-#elif defined(MEDIAN)
- uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
-#else /* MIN or MAX or MEDIAN */
-#error "Unsupported filter function"
-#endif /* MIN or MAX or MEDIAN */
-
- // Store result
- vstore8(out, 0, dst.ptr);
-}
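[Editor's note] Note the shift applied to top2 and bottom2 in the disk kernels: it centres the 3-wide outer rows of the disk against the 5-wide inner rows before the row reductions. A scalar sketch of the disk shape for the MIN case (disk5x5_min is a hypothetical reference without border handling):

/* Minimum over a 5x5 disk centred on (x, y): the rows two above and two
 * below span only 3 pixels, the middle three rows span 5, matching the
 * kernel's shifted top2/bottom2 trick. Sketch only; accesses are unchecked. */
unsigned char disk5x5_min(const unsigned char *img, int stride, int x, int y)
{
    unsigned char m = 255;
    for (int ky = -2; ky <= 2; ++ky)
    {
        int half = (ky == -2 || ky == 2) ? 1 : 2; /* disk: 3-wide outer rows */
        for (int kx = -half; kx <= half; ++kx)
        {
            unsigned char v = img[(y + ky) * stride + (x + kx)];
            if (v < m)
                m = v;
        }
    }
    return m;
}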
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
deleted file mode 100644
index 3fcfad46f5..0000000000
--- a/src/core/CL/cl_kernels/non_linear_filter_helpers.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/** Sorts element-wise two vectors.
- *
- * @param[in, out] a First vector
- * @param[in, out] b Second vector
- */
-#define SORT(a, b) \
- { \
- uchar8 min_val = min(a, b); \
- uchar8 max_val = max(a, b); \
- a = min_val; \
- b = max_val; \
- }
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-
-/** Sorting network to sort 5 vectors of 8 elements and return their median.
- *
- * @param[in] p0 First element vector
- * @param[in] p1 Second element vector
- * @param[in] p2 Third element vector
- * @param[in] p3 Fourth element vector
- * @param[in] p4 Fifth element vector
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
-{
- SORT(p0, p1);
- SORT(p2, p3);
- SORT(p0, p2);
- SORT(p1, p3);
- SORT(p1, p2);
- SORT(p0, p4);
- SORT(p1, p4);
- SORT(p2, p4);
-
- return p2;
-}
-
-/** Sorting network to sort 9 vectors of 8 elements and return their median.
- *
- * @param[in] p0 First element vector
- * @param[in] p1 Second element vector
- * @param[in] p2 Third element vector
- * @param[in] p3 Fourth element vector
- * @param[in] p4 Fifth element vector
- * @param[in] p5 Sixth element vector
- * @param[in] p6 Seventh element vector
- * @param[in] p7 Eighth element vector
- * @param[in] p8 Ninth element vector
- *
- * @return Median values for 8 elements.
- */
-inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
-{
- SORT(p1, p2);
- SORT(p4, p5);
- SORT(p7, p8);
- SORT(p0, p1);
- SORT(p3, p4);
- SORT(p6, p7);
- SORT(p1, p2);
- SORT(p4, p5);
- SORT(p7, p8);
- SORT(p0, p3);
- SORT(p5, p8);
- SORT(p4, p7);
- SORT(p3, p6);
- SORT(p1, p4);
- SORT(p2, p5);
- SORT(p4, p7);
- SORT(p4, p2);
- SORT(p6, p4);
- SORT(p4, p2);
-
- return p4;
-}
-
-/** Calculate the minimum of a sliding window of size 3.
- *
- * @param val Input values from which to compute the sliding-window minimum
- *
- * @return Minimum values of 8 elements on a sliding window of size 3.
- */
-inline uchar8 row_reduce_min_3(uchar16 val)
-{
- return min(val.s01234567, min(val.s12345678, val.s23456789));
-}
-
-/** Calculate the maximum of a sliding window of size 3.
- *
- * @param val Input values from which to compute the sliding-window maximum
- *
- * @return Maximum values of 8 elements on a sliding window of size 3.
- */
-inline uchar8 row_reduce_max_3(uchar16 val)
-{
- return max(val.s01234567, max(val.s12345678, val.s23456789));
-}
-
-/** Calculate the minimum of a sliding window of size 5.
- *
- * @param val Input values from which to compute the sliding-window minimum
- *
- * @return Minimum values of 8 elements on a sliding window of size 5.
- */
-inline uchar8 row_reduce_min_5(uchar16 val)
-{
- return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
-}
-
-/** Calculate the maximum of a sliding window of size 5.
- *
- * @param val Input values from which to compute the sliding-window maximum
- *
- * @return Maximum values of 8 elements on a sliding window of size 5.
- */
-inline uchar8 row_reduce_max_5(uchar16 val)
-{
- return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
-}
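[Editor's note] Read one lane at a time, the helpers in this header reduce to a compare-exchange and a width-3 (or width-5) sliding-window reduction. A plain-C sketch with hypothetical names:

/* Scalar equivalent of SORT: a compare-exchange that leaves the smaller
 * value in *a and the larger in *b. */
static inline void sort2(unsigned char *a, unsigned char *b)
{
    unsigned char lo = (*a < *b) ? *a : *b;
    unsigned char hi = (*a < *b) ? *b : *a;
    *a = lo;
    *b = hi;
}

/* Scalar equivalent of row_reduce_min_3: 8 outputs, each the minimum of a
 * width-3 window over 16 inputs (the vector version uses swizzles instead). */
static void row_reduce_min_3_scalar(const unsigned char in[16], unsigned char out[8])
{
    for (int i = 0; i < 8; ++i)
    {
        unsigned char m = (in[i] < in[i + 1]) ? in[i] : in[i + 1];
        out[i] = (m < in[i + 2]) ? m : in[i + 2];
    }
}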
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
deleted file mode 100644
index 9bbde1a57f..0000000000
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "types.h"
-
-/*
- * A point is considered lost when the spatial gradient matrix has:
- * - a determinant less than DETERMINANT_THR, or
- * - a minimum eigenvalue smaller than EIGENVALUE_THR
- *
- * The thresholds for the determinant and the minimum eigenvalue are
- * defined by the OpenVX spec
- *
- * Note: Tracking is also lost when the tracked point's coordinates fall
- * outside the image coordinates
- *
- * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
- */
-
-/* Internal Lucas-Kanade Keypoint struct */
-typedef struct InternalKeypoint
-{
- float x; /**< The x coordinate. */
- float y; /**< The y coordinate. */
- float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
- float dummy; /**< Dummy member for alignment. */
-} InternalKeypoint;
-
-/** Threshold for the determinant. Used for lost tracking criteria */
-#define DETERMINANT_THR 1.0e-07f
-
-/** Threshold for the minimum eigenvalue. Used for lost tracking criteria */
-#define EIGENVALUE_THR 1.0e-04f
-
-/** Constants used for Lucas-Kanade Algorithm */
-#define W_BITS (14)
-#define FLT_SCALE (1.0f / (float)(1 << 20))
-#define D0 ((float)(1 << W_BITS))
-#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
-
-/** Initializes the internal new points array when the level of pyramid is NOT equal to max.
- *
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in] scale Scale factor to apply to the new_point coordinates.
- */
-__kernel void init_level(
- __global float4 *old_points_internal,
- __global float4 *new_points_internal,
- const float scale)
-{
- int idx = get_global_id(0);
-
- // Get old and new keypoints
- float4 old_point = old_points_internal[idx];
- float4 new_point = new_points_internal[idx];
-
- // Scale accordingly with the pyramid_scale
- old_point.xy *= (float2)(2.0f);
- new_point.xy *= (float2)(2.0f);
-
- old_points_internal[idx] = old_point;
- new_points_internal[idx] = new_point;
-}
-
-/** Initializes the internal new points array when the level of pyramid is equal to max.
- *
- * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in] scale Scale factor to apply to the new_point coordinates.
- */
-__kernel void init_level_max(
- __global Keypoint *old_points,
- __global InternalKeypoint *old_points_internal,
- __global InternalKeypoint *new_points_internal,
- const float scale)
-{
- int idx = get_global_id(0);
-
- Keypoint old_point = old_points[idx];
-
- // Get old keypoint to track
- InternalKeypoint old_point_internal;
- old_point_internal.x = old_point.x * scale;
- old_point_internal.y = old_point.y * scale;
- old_point_internal.tracking_status = 1.f;
-
- // Store internal keypoints
- old_points_internal[idx] = old_point_internal;
- new_points_internal[idx] = old_point_internal;
-}
-
-/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1.
- *
- * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
- * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
- * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
- * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[in] scale Scale factor to apply to the new_point coordinates.
- */
-__kernel void init_level_max_initial_estimate(
- __global Keypoint *old_points,
- __global Keypoint *new_points_estimates,
- __global InternalKeypoint *old_points_internal,
- __global InternalKeypoint *new_points_internal,
- const float scale)
-{
- int idx = get_global_id(0);
-
- Keypoint old_point = old_points[idx];
- Keypoint new_point_estimate = new_points_estimates[idx];
- InternalKeypoint old_point_internal;
- InternalKeypoint new_point_internal;
-
- // Get old keypoint to track
- old_point_internal.x = old_point.x * scale;
- old_point_internal.y = old_point.y * scale;
- old_point_internal.tracking_status = 1.f;
-
- // Get new keypoint to track
- new_point_internal.x = new_point_estimate.x * scale;
- new_point_internal.y = new_point_estimate.y * scale;
- new_point_internal.tracking_status = new_point_estimate.tracking_status;
-
- // Store internal keypoints
- old_points_internal[idx] = old_point_internal;
- new_points_internal[idx] = new_point_internal;
-}
-
-/** Rounds the coordinates stored in the internal keypoints array and stores them in the new_points array
- *
- * @param[in] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
- * @param[out] new_points An array of key points that are defined at the new_images high resolution pyramid.
- */
-__kernel void finalize(
- __global InternalKeypoint *new_points_internal,
- __global Keypoint *new_points)
-{
- int idx = get_global_id(0);
-
- // Load internal keypoint
- InternalKeypoint new_point_internal = new_points_internal[idx];
-
- // Calculate output point
- Keypoint new_point;
- new_point.x = round(new_point_internal.x);
- new_point.y = round(new_point_internal.y);
- new_point.strength = 0.f;
- new_point.scale = 0.f;
- new_point.orientation = 0.f;
- new_point.tracking_status = new_point_internal.tracking_status;
- new_point.error = 0.f;
-
- // Store new point
- new_points[idx] = new_point;
-}
-
-/** Computes A11, A12, A22, min_eig, ival, ixval and iyval for the current level of the pyramid. These values will be used in stage 1.
- *
- * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8
- * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes)
- * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes)
- * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image
- * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. Supported data types: S16
- * @param[in] old_scharr_gx_stride_x Stride of the input scharr x image in X dimension (in bytes)
- * @param[in] old_scharr_gx_step_x old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] old_scharr_gx_stride_y Stride of the input scharr x image in Y dimension (in bytes)
- * @param[in] old_scharr_gx_step_y old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
- * @param[in] old_scharr_gy_ptr Pointer to the input scharr y image. Supported data types: S16
- * @param[in] old_scharr_gy_stride_x Stride of the input scharr y image in X dimension (in bytes)
- * @param[in] old_scharr_gy_step_x old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] old_scharr_gy_stride_y Stride of the input scharr y image in Y dimension (in bytes)
- * @param[in] old_scharr_gy_step_y old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
- * @param[in] old_points An array of key points. Those key points are defined at the old_images high resolution pyramid
- * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
- * @param[out] coeff It stores | A11 | A12 | A22 | min_eig | for each keypoint
- * @param[out] iold_val It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
- * @param[in] half_window The half size of the window on which to perform the algorithm
- * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1)
- * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
- * @param[in] level0 It is set to 1 if the current level is level 0 of the pyramid
- */
-void __kernel lktracker_stage0(
- IMAGE_DECLARATION(old_image),
- IMAGE_DECLARATION(old_scharr_gx),
- IMAGE_DECLARATION(old_scharr_gy),
- __global float4 *old_points,
- __global float4 *new_points,
- __global float4 *coeff,
- __global short4 *iold_val,
- const int window_dimension,
- const int window_dimension_pow2,
- const int half_window,
- const float3 border_limits,
- const float eig_const,
- const int level0)
-{
- int idx = get_global_id(0);
-
- Image old_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
- Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
- Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
-
- // Get old keypoint
- float2 old_keypoint = old_points[idx].xy - (float2)half_window;
-
- // Get the floor value
- float2 iold_keypoint = floor(old_keypoint);
-
- // Check whether the window would read out of bounds in the for loops below. If so, invalidate the tracked point
- if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
- {
- if(level0 == 1)
- {
- // Invalidate tracked point as we are at level 0
- new_points[idx].s2 = 0.0f;
- }
-
- // Not valid coordinate. It sets min_eig to 0.0f
- coeff[idx].s3 = 0.0f;
-
- return;
- }
-
- // Compute weight for the bilinear interpolation
- float2 ab = old_keypoint - iold_keypoint;
-
- // Weight used for Bilinear-Interpolation on Scharr images
- // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
- // w_scharr.s1 = ab.x * (1.0f - ab.y)
- // w_scharr.s2 = (1.0f - ab.x) * ab.y
- // w_scharr.s3 = ab.x * ab.y
-
- float4 w_scharr;
- w_scharr.s3 = ab.x * ab.y;
- w_scharr.s0 = w_scharr.s3 + 1.0f - ab.x - ab.y;
- w_scharr.s12 = ab - (float2)w_scharr.s3;
-
- // Weight used for Bilinear-Interpolation on Old and New images
- // w.s0 = round(w_scharr.s0 * D0)
- // w.s1 = round(w_scharr.s1 * D0)
- // w.s2 = round(w_scharr.s2 * D0)
- // w.s3 = D0 - w.s0 - w.s1 - w.s2
-
- float4 w;
- w = round(w_scharr * (float4)D0);
- w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
-
- // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
- int4 iG = (int4)0;
-
- // Window offset
- int window_offset = idx * window_dimension_pow2;
-
- // Compute Spatial Gradient Matrix G
- for(ushort ky = 0; ky < window_dimension; ++ky)
- {
- int offset_y = iold_keypoint.y + ky;
- for(ushort kx = 0; kx < window_dimension; ++kx)
- {
- int offset_x = iold_keypoint.x + kx;
- float4 px;
-
- // Load values from old_image for computing the bilinear interpolation
- px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
- vload2(0, offset(&old_image, offset_x, offset_y + 1))));
-
- // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
- float4 old_i;
-
- // Compute bilinear interpolation (with D1 scale factor) for ival
- old_i.s0 = dot(px, w) * D1;
-
- // Load values from old_scharr_gx for computing the bilinear interpolation
- px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
- vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
-
- // Compute bilinear interpolation for ixval
- old_i.s1 = dot(px, w_scharr);
-
- // Load values from old_scharr_gy for computing the bilinear interpolation
- px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
- vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
-
- // Compute bilinear interpolation for iyval
- old_i.s2 = dot(px, w_scharr);
-
- // Rounding (it could be omitted. Used just for matching the VX implementation)
- int4 iold = convert_int4(round(old_i));
-
- // Accumulate values in the Spatial Gradient Matrix
- iG.s0 += (int)(iold.s1 * iold.s1);
- iG.s1 += (int)(iold.s1 * iold.s2);
- iG.s2 += (int)(iold.s2 * iold.s2);
-
- // Store ival, ixval and iyval
- iold_val[window_offset + kx] = convert_short4(iold);
- }
- window_offset += window_dimension;
- }
-
- // Scale iA11, iA12 and iA22
- float4 G = convert_float4(iG) * (float4)FLT_SCALE;
-
- // Compute minimum eigen value
- G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
-
- // Store A11, A12, A22 and min_eig
- coeff[idx] = G;
-}
-
-/** Computes the motion vector for a given keypoint
- *
- * @param[in] new_image_ptr Pointer to the input new image. Supported data types: U8
- * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes)
- * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes)
- * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image
- * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
- * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint
- * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
- * @param[in] half_window The half size of the window on which to perform the algorithm
- * @param[in] num_iterations The maximum number of iterations
- * @param[in] epsilon The value for terminating the algorithm.
- * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1)
- * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
- * @param[in] level0 It is set to 1 if the current level is level 0 of the pyramid
- * @param[in] term_epsilon It is set to 1 if termination = TERM_CRITERIA_EPSILON
- */
-void __kernel lktracker_stage1(
- IMAGE_DECLARATION(new_image),
- __global float4 *new_points,
- __global float4 *coeff,
- __global short4 *iold_val,
- const int window_dimension,
- const int window_dimension_pow2,
- const int half_window,
- const int num_iterations,
- const float epsilon,
- const float3 border_limits,
- const float eig_const,
- const int level0,
- const int term_epsilon)
-{
- int idx = get_global_id(0);
- Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
-
- // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
- float4 G = coeff[idx];
-
- // Determinant
- float D = G.s0 * G.s2 - G.s1 * G.s1;
-
- // Check if it is a good point to track
- if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
- {
- if(level0 == 1)
- {
- // Invalidate tracked point as we are at level 0
- new_points[idx].s2 = 0;
- }
-
- return;
- }
-
- // Compute inverse
- D = 1.0f / D;
-
- // Get new keypoint
- float2 new_keypoint = new_points[idx].xy - (float)half_window;
-
- // Get new point
- float2 out_new_point = new_points[idx].xy;
-
- // Keep delta obtained in the previous iteration
- float2 prev_delta = (float2)0.0f;
-
- int j = 0;
- while(j < num_iterations)
- {
- // Get the floor value
- float2 inew_keypoint = floor(new_keypoint);
-
- // Check whether the window would read out of bounds in the for loops below. If so, invalidate the tracked point
- if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
- {
- if(level0 == 1)
- {
- // Invalidate tracked point as we are at level 0
- new_points[idx].s2 = 0.0f;
- }
- else
- {
- new_points[idx].xy = out_new_point;
- }
-
- return;
- }
-
- // Compute weight for the bilinear interpolation
- float2 ab = new_keypoint - inew_keypoint;
-
- // Weight used for Bilinear-Interpolation on Old and New images
- // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
- // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
- // w.s2 = round((1.0f - ab.x) * ab.y * D0)
- // w.s3 = D0 - w.s0 - w.s1 - w.s2
-
- float4 w;
- w.s3 = ab.x * ab.y;
- w.s0 = w.s3 + 1.0f - ab.x - ab.y;
- w.s12 = ab - (float2)w.s3;
- w = round(w * (float4)D0);
- w.s3 = D0 - w.s0 - w.s1 - w.s2;
-
- // Mismatch vector
- int2 ib = 0;
-
- // Old val offset
- int old_val_offset = idx * window_dimension_pow2;
-
- for(int ky = 0; ky < window_dimension; ++ky)
- {
- for(int kx = 0; kx < window_dimension; ++kx)
- {
- // ival, ixval and iyval have been computed in the previous stage
- int4 old_ival = convert_int4(iold_val[old_val_offset]);
-
- // Load values from new_image for computing the bilinear interpolation
- float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
- vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
-
- // Compute bilinear interpolation on new image
- int jval = (int)round(dot(px, w) * D1);
-
- // Compute luminance difference
- int diff = (int)(jval - old_ival.s0);
-
- // Accumulate values in mismatch vector
- ib += (diff * old_ival.s12);
-
- // Update old val offset
- old_val_offset++;
- }
- }
-
- float2 b = convert_float2(ib) * (float2)FLT_SCALE;
-
- // Optical Flow
- float2 delta;
-
- delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
- delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
-
- // Update new point coordinate
- new_keypoint += delta;
-
- out_new_point = new_keypoint + (float2)half_window;
-
- if(term_epsilon == 1)
- {
- float mag2 = dot(delta, delta);
-
- if(mag2 <= epsilon)
- {
- new_points[idx].xy = out_new_point;
-
- return;
- }
- }
-
- // Check convergence analyzing the previous delta
- if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
- {
- out_new_point -= delta * (float2)0.5f;
-
- new_points[idx].xy = out_new_point;
-
- return;
- }
-
- // Update previous delta
- prev_delta = delta;
-
- j++;
- }
-
- new_points[idx].xy = out_new_point;
-}
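[Editor's note] The fixed-point scheme above is worth spelling out: with W_BITS = 14 the four bilinear weights are scaled by D0 = 2^14 = 16384 and rounded, and the last weight is recomputed so the four sum exactly to D0, matching the commented formulas in both stages; D1 = 2^-(W_BITS - 5) and FLT_SCALE = 2^-20 then undo the scaling after the dot products. A small sketch of the weight computation (bilinear_weights is a hypothetical helper):

#include <math.h>

/* Fixed-point bilinear weights as used in the stages above. ax and ay are
 * the fractional parts of the keypoint coordinate; forcing the fourth
 * weight to D0 minus the others guarantees an exact sum of D0. */
void bilinear_weights(float ax, float ay, float w[4])
{
    const float D0 = (float)(1 << 14);          /* W_BITS = 14 */
    w[0] = roundf((1.0f - ax) * (1.0f - ay) * D0);
    w[1] = roundf(ax * (1.0f - ay) * D0);
    w[2] = roundf((1.0f - ax) * ay * D0);
    w[3] = D0 - w[0] - w[1] - w[2];             /* exact sum, as in stage 0/1 */
}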
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
deleted file mode 100644
index 00250a08a5..0000000000
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "repeat.h"
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(x, y) ((x) + (y))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(x, y) (fmax((x), (y)))
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
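[Editor's note] The macros above factor the three pooling types into one accumulation loop: POW2_OP squares the inputs only for L2 pooling, POOL_OP either sums or takes the maximum, and DIV_OP/SQRT_OP finish the average and L2 cases. A scalar sketch of that decomposition (pool_window and the enum are hypothetical, and padding handling is omitted):

#include <math.h>

typedef enum { POOL_MAX_T, POOL_AVG_T, POOL_L2_T } pool_type;

/* Pool n values with the same accumulate/finalize split as the macros:
 * POW2_OP on load, POOL_OP to accumulate, DIV_OP and SQRT_OP at the end. */
float pool_window(const float *v, int n, pool_type type)
{
    float acc = (type == POOL_MAX_T) ? -INFINITY : 0.0f;
    for (int i = 0; i < n; ++i)
    {
        float x = (type == POOL_L2_T) ? v[i] * v[i] : v[i];   /* POW2_OP */
        acc = (type == POOL_MAX_T) ? fmaxf(acc, x) : acc + x; /* POOL_OP */
    }
    if (type == POOL_AVG_T)
        acc /= (float)n;                                      /* DIV_OP */
    else if (type == POOL_L2_T)
        acc = sqrtf(acc / (float)n);                          /* DIV_OP + SQRT_OP */
    return acc;
}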
-
-#if STRIDE_X == 1
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
-#elif STRIDE_X == 3 /* STRIDE_X not equal to 1 or 2 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
-#endif /* STRIDE_X == 3 */
-
-#if defined(FP_MIXED_PRECISION)
-#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
- CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
-#else /* defined(FP_MIXED_PRECISION) */
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
-#endif /* defined(FP_MIXED_PRECISION) */
-
-#define POOLING3x3_STRIDE1(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
- \
- values00 = POOL_OP(values00, values10); \
- values01 = POOL_OP(values01, values11); \
- values00 = POOL_OP(values00, values20); \
- values01 = POOL_OP(values01, values21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
- })
-
-#define POOLING3x3_STRIDE2(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \
- data00 = POW2_OP(data00, 8); \
- data01 = POW2_OP(data01, 1); \
- data10 = POW2_OP(data10, 8); \
- data11 = POW2_OP(data11, 1); \
- data20 = POW2_OP(data20, 8); \
- data21 = POW2_OP(data21, 1); \
- \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \
- \
- values00 = POOL_OP(values00, values10); \
- values01 = POOL_OP(values01, values11); \
- values00 = POOL_OP(values00, values20); \
- values01 = POOL_OP(values01, values21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
- })
-
-#define POOLING3x3_STRIDE3(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
- data00 = POW2_OP(data00, 8); \
- data01 = POW2_OP(data01, 4); \
- data10 = POW2_OP(data10, 8); \
- data11 = POW2_OP(data11, 4); \
- data20 = POW2_OP(data20, 8); \
- data21 = POW2_OP(data21, 4); \
- \
- data00 = POOL_OP(data00, data10); \
- data01 = POOL_OP(data01, data11); \
- data00 = POOL_OP(data00, data20); \
- data01 = POOL_OP(data01, data21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \
- })
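
Each variant loads exactly the input span that four 3-wide windows cover per row: (4 - 1) * STRIDE_X + 3 elements, i.e. 6, 9 and 12 for strides 1, 2 and 3. That is why the macros above pair a vload4 with a vload2, a vload8 with one scalar, and a vload8 with a vload4, respectively. A standalone check of the arithmetic:

    #include <assert.h>

    int main(void)
    {
        /* span = (outputs - 1) * stride + window_width */
        assert((4 - 1) * 1 + 3 == 6);  /* STRIDE1: vload4 + vload2     */
        assert((4 - 1) * 2 + 3 == 9);  /* STRIDE2: vload8 + one scalar */
        assert((4 - 1) * 3 + 3 == 12); /* STRIDE3: vload8 + vload4     */
        return 0;
    }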
-
-ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return ((end_y - start_y) * (end_x - start_x));
-}
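
A worked host-side re-implementation (a sketch with assumed values, not part of the kernel) showing how EXCLUDE_PADDING changes the averaging divisor for the top-left output of a 3x3 pool with stride 1 and pad 1:

    #include <stdio.h>

    static int min_i(int a, int b) { return a < b ? a : b; }
    static int max_i(int a, int b) { return a > b ? a : b; }

    /* Mirrors calculate_avg_scale for one output element at (ox, oy) */
    static int avg_area(int ox, int oy, int pool, int bound, int pad, int stride, int exclude_pad)
    {
        int sx = ox * stride - pad;
        int sy = oy * stride - pad;
        int ex = min_i(sx + pool, bound);
        int ey = min_i(sy + pool, bound);
        if(exclude_pad)
        {
            sx = max_i(0, sx);
            sy = max_i(0, sy);
        }
        return (ey - sy) * (ex - sx);
    }

    int main(void)
    {
        printf("%d\n", avg_area(0, 0, 3, 8, 1, 1, 0)); /* 9: padded area counted */
        printf("%d\n", avg_area(0, 0, 3, 8, 1, 1, 1)); /* 4: padding excluded    */
        return 0;
    }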
-
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided; otherwise, max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_2(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 2);
- data1 = POW2_OP(data1, 2);
-#endif /* defined(POOL_L2) */
-
- // Perform calculations
- data0 = POOL_OP(data0, data1);
- ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average or l2 pooling
- res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
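
A hypothetical host-side configuration for this kernel (flag names taken from the @note block above; the concrete values and the raw clBuildProgram call are illustrative only, not the library's kernel-build path):

    const char *build_opts =
        "-DDATA_TYPE=float -DACC_DATA_TYPE=float "
        "-DPOOL_AVG "                              /* omit for max pooling */
        "-DMAX_WIDTH=112 -DMAX_HEIGHT=112 "
        "-DSTRIDE_X=2 -DSTRIDE_Y=2 -DPAD_X=0 -DPAD_Y=0";
    /* clBuildProgram(program, 1, &device, build_opts, NULL, NULL); */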
-
-/** Performs a pooling function of pool size equal to 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided; otherwise, max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_3(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 3);
- data1 = POW2_OP(data1, 3);
- data2 = POW2_OP(data2, 3);
-#endif /* defined(POOL_L2) */
-
- // Perform calculations
- data0 = POOL_OP(data0, data1);
- data0 = POOL_OP(data0, data2);
- ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-#if defined(POOLING3x3)
-
-#define CONVERT_OP(data_type) convert_##data_type##4
-#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
-
-VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
-calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
- const int end_y = min(start_y + pool_size, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max((int4)0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
-}
-
-/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided; otherwise, max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_optimized_3(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- res;
-
- // Perform pooling 3x3 for 4 output elements
- POOLING3x3(res, input, output);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(POOLING3x3)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided; otherwise, max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- vdata = INITIAL_VALUE;
- ACC_DATA_TYPE sdata = INITIAL_VALUE;
-
- // Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
- {
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
- {
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif /* defined(POOL_L2) */
- vdata = POOL_OP(vdata, data0);
- }
-
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif /* defined(POOL_L2) */
- sdata = POOL_OP(sdata, data0);
- }
- }
-
- // Reduce result
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
- res = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
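
The row loop above splits POOL_SIZE_X into 8-wide vector iterations plus a scalar tail, accumulated separately into vdata and sdata and merged by the final reduction. A standalone sketch of that split for an assumed POOL_SIZE_X of 13:

    #include <assert.h>

    int main(void)
    {
        const int pool_size_x = 13; /* assumed example value */
        int x = 0, vec_iters = 0, tail = 0;
        for(; x <= pool_size_x - 8; x += 8) { ++vec_iters; } /* covers x = 0..7  */
        for(; x < pool_size_x; ++x) { ++tail; }              /* covers x = 8..12 */
        assert(vec_iters == 1 && tail == 5);
        return 0;
    }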
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
-{
- const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
- const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
-
- const int x = get_global_id(0) * STRIDE_X;
- const int y = get_global_id(1) * STRIDE_Y;
- const int z = get_global_id(2);
-
- //x axis: width, y axis: height, z axis: component
- const uint padded_offset = input->offset_first_element_in_bytes
- + x * input->stride_x
- + y * input->stride_y
- + z * input->stride_z;
-
- const uint offset_base = padded_offset
- - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
- - PAD_TENSOR_TOP * input->stride_y /* top padding */
- - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
- - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
-
-#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
- *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
-#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
- *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
-#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-
- *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
-
- return;
-}
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
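
A worked check of the offset_bottom relation under assumed sizes (float data, unpadded row width 4, one element of padding on each side): the padded row holds six floats, so subtracting pad_horiz from the element-wise row stride lands exactly one unpadded row below offset_top:

    #include <assert.h>

    int main(void)
    {
        const int pad_horiz = 1 + 1;               /* PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT */
        const int stride_y  = (4 + pad_horiz) * 4; /* bytes per padded row of floats     */
        const unsigned offset_top    = 0;
        const unsigned offset_bottom = offset_top + stride_y / 4 - pad_horiz;
        assert(offset_bottom == 4);                /* one unpadded row of width 4 below  */
        return 0;
    }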
-
-/** Performs a MAX pooling of pool size equal to 2, and records the indices of the maximum values for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F32
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensor width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using -DPAD_TENSOR_LEFT, -DPAD_TENSOR_RIGHT, -DPAD_TENSOR_TOP and -DPAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp32(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
- float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- float data0_max = POOL_OP(data0.s0, data0.s1);
- float data1_max = POOL_OP(data1.s0, data1.s1);
- float res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global float *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
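
The three select() calls implement a 2x2 argmax whose >= comparisons break ties toward the earlier (top-left-most) element. An equivalent scalar sketch (a hypothetical helper, not part of the kernel):

    /* d0/d1 hold the top and bottom row pairs; off_top/off_bottom their flat offsets */
    unsigned argmax_2x2(const float d0[2], const float d1[2],
                        unsigned off_top, unsigned off_bottom)
    {
        unsigned idx0 = (d0[0] >= d0[1]) ? off_top : off_top + 1;
        unsigned idx1 = (d1[0] >= d1[1]) ? off_bottom : off_bottom + 1;
        float    max0 = (d0[0] >= d0[1]) ? d0[0] : d0[1];
        float    max1 = (d1[0] >= d1[1]) ? d1[0] : d1[1];
        return (max0 >= max1) ? idx0 : idx1; /* top row wins ties */
    }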
-
-/** Performs a MAX pooling of pool size equal to 2, and records the indices of the maximum values for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensor width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using -DPAD_TENSOR_LEFT, -DPAD_TENSOR_RIGHT, -DPAD_TENSOR_TOP and -DPAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp16(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
- half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- half data0_max = POOL_OP(data0.s0, data0.s1);
- half data1_max = POOL_OP(data1.s0, data1.s1);
- half res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global half *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- *  -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
-    // Note: If C is not a multiple of VEC_SIZE, accesses are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements so that get_global_id(0) == 0 computes the leftover elements
-    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller supported vector size. This adjustment is performed on the host side
- int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
- int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = get_global_id(2) % DST_HEIGHT;
- int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else //DST_BATCH_SIZE != 1
- int idx_out_h = get_global_id(2);
- int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
- offset_c +
- idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
- offset_c +
- idx_out_w * output_stride_y +
- idx_out_h * output_stride_z +
- idx_out_n * output_stride_w;
-
-#if ((defined(POOL_AVG) || defined(POOL_L2)))
-#if defined(EXCLUDE_PADDING)
- int filter_size = 0;
-#else // defined(EXCLUDE_PADDING)
- int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(EXCLUDE_PADDING)
-#endif // ((defined(POOL_AVG) || defined(POOL_L2)))
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = INITIAL_VALUE;
-
- for(int y = pool_y_s; y < pool_y_e; ++y)
- {
- for(int x = pool_x_s; x < pool_x_e; ++x)
- {
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0;
-#if defined(FP_MIXED_PRECISION)
- // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
- data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif // defined(POOL_L2)
- res0 = POOL_OP(res0, data0);
-
-#if ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING)
- filter_size++;
-#endif // ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING)
- }
- }
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
- // Store result
-#if defined(FP_MIXED_PRECISION)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else // defined(FP_MIXED_PRECISION)
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
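
The pool_x_s/pool_x_e (and pool_y_s/pool_y_e) clamping restricts the loops to the valid taps of a window that may hang over the tensor border. A standalone check with assumed parameters (POOL_SIZE_X = 3, SRC_WIDTH = 8, PAD_X = 1, STRIDE_X = 2) for output column 0:

    #include <assert.h>

    static int max_i(int a, int b) { return a > b ? a : b; }
    static int min_i(int a, int b) { return a < b ? a : b; }

    int main(void)
    {
        int idx_in_w = 0 * 2 - 1;               /* idx_out_w * STRIDE_X - PAD_X = -1 */
        int pool_x_s = max_i(0, -idx_in_w);     /* first valid tap: 1                */
        int pool_x_e = min_i(3, 8 - idx_in_w);  /* one past last valid tap: 3        */
        assert(pool_x_s == 1 && pool_x_e == 3); /* two valid taps, input cols 0..1   */
        return 0;
    }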
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-
-/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
- *  -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
- * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2x2_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output)
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
- ,
- TENSOR4D_DECLARATION(indices)
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-)
-{
-    // Note: If C is not a multiple of VEC_SIZE, accesses are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements so that get_global_id(0) == 0 computes the leftover elements
-    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller supported vector size. This adjustment is performed on the host side
- int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = get_global_id(2) % DST_HEIGHT;
- int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else // DST_BATCH_SIZE != 1
- int idx_out_h = get_global_id(2);
- int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
- idx_out_c * sizeof(DATA_TYPE) +
- idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
- idx_out_c * sizeof(DATA_TYPE) +
- idx_out_w * output_stride_y +
- idx_out_h * output_stride_z +
- idx_out_n * output_stride_w;
-
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
-
- int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
-
- int x0 = pool_x_s + idx_in_w;
- int y0 = pool_y_s + idx_in_h;
- int x1 = pool_x_e - 1 + idx_in_w;
- int y1 = pool_y_e - 1 + idx_in_h;
-
- REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
-
-#if defined(FP_MIXED_PRECISION)
- // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
- data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
- data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
- data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
- data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if !defined(POOL_MAX)
- if(filter_size != 4)
- {
- SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
- SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
- SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
- SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
-
-        // Invalidate the loaded values if the x or y coordinate was clamped (out of bounds)
- data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
- data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
- data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
- data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
- }
-#endif // !defined(POOL_MAX)
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
- data1 *= data1;
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = data0;
- res0 = POOL_OP(res0, data1);
- res0 = POOL_OP(res0, data2);
- res0 = POOL_OP(res0, data3);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#if defined(EXCLUDE_PADDING)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#else // !defined(EXCLUDE_PADDING)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
-#endif // defined(EXCLUDE_PADDING)
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
- // Store result
-#if defined(FP_MIXED_PRECISION)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else // defined(FP_MIXED_PRECISION)
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-
- // This part is used to return the index of the maximum value
-    // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for both the input and output tensors
-
-    // Note: The batch dimension does not contribute to the index computation
- VEC_DATA_TYPE(uint, VEC_SIZE) base_index = (uint)idx_out_c;
-
- base_index += VEC_OFFS(uint, VEC_SIZE);
-
- VEC_DATA_TYPE(uint, VEC_SIZE) index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE) index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE) index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE) index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
-
- index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
- index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
- index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
-
- __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes +
- idx_out_c * sizeof(uint) +
- idx_out_w * indices_stride_y +
- idx_out_h * indices_stride_z +
- idx_out_n * indices_stride_w;
-
- // Store result
- STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file
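
The indices stored by the 2x2 NHWC path above are unpadded flat offsets of the form c + x * DST_CHANNELS + y * DST_CHANNELS * SRC_WIDTH, with the batch dimension excluded. A quick check with assumed sizes:

    #include <assert.h>

    int main(void)
    {
        const unsigned C = 16, W = 7; /* assumed DST_CHANNELS and SRC_WIDTH */
        const unsigned c = 3, x = 2, y = 1;
        assert(c + x * C + y * C * W == 147); /* 3 + 32 + 112 */
        return 0;
    }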
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
deleted file mode 100644
index 0f013c5127..0000000000
--- a/src/core/CL/cl_kernels/remap.cl
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using nearest-neighbour interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in]  mapx_step_y                        mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in]  mapy_ptr                           Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_nearest_neighbour(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
- map_coords += (float8)(0.5f);
-
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
-}
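
The +0.5f shift makes the later truncation to integer texel coordinates behave as round-to-nearest, which is what nearest-neighbour sampling needs. A standalone illustration:

    #include <assert.h>

    int main(void)
    {
        assert((int)(2.3f + 0.5f) == 2); /* nearest texel is 2 */
        assert((int)(2.7f + 0.5f) == 3); /* nearest texel is 3 */
        return 0;
    }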
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_bilinear(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index bed94a7b3b..cb2f4b0319 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -75,7 +75,9 @@
P_X##_DEF(F, P_A, P_B, P_C); \
REPEAT_3_15(P_X, P_A, P_B, P_C)
-#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, \
+ P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
// Repeat macros with 4 param, excluding the implicit ID param
@@ -126,52 +128,59 @@
P_X##_DEF(F, P_A, P_B, P_C, P_D); \
REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
-#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \
+ P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
// Macro for initializing N variables. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
// Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
-#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for adding a constant to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...)
-#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
// Macro for adding a vector to N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
-#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
// Macro for adding two N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
-#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
// Macro for performing Max between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing Min between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
@@ -182,6 +191,7 @@
VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
})
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
#endif // ARM_COMPUTE_REPEAT_H
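The indirection kept by the reformatted REPEAT_DEF_3_N/REPEAT_DEF_4_N macros is the usual preprocessor trick for expanding an argument before token pasting; a minimal standalone illustration (all names below are invented for the example):

#include <stdio.h>

#define N_VALUE 3

#define MAKE_DIRECT(n)   make_##n         /* ## suppresses expansion: would paste make_N_VALUE */
#define MAKE_HELPER(n)   make_##n
#define MAKE_INDIRECT(n) MAKE_HELPER(n)   /* n expands to 3 first, so this pastes make_3 */

static const char *make_3(void) { return "make_3 selected"; }

int main(void)
{
    printf("%s\n", MAKE_INDIRECT(N_VALUE)()); /* prints: make_3 selected */
    return 0;
}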
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
deleted file mode 100644
index a01ff89a4f..0000000000
--- a/src/core/CL/cl_kernels/scale.cl
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_nearest(const float2 coord, const float2 scale)
-{
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
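The two transform helpers differ only in their half-pixel offsets; restated as scalars (illustrative helpers, not library code):

/* Scalar form of the coordinate mapping above (illustrative). TOP_LEFT maps
 * output pixel origins straight into input space; CENTER maps output pixel
 * centres, and the bilinear variant subtracts a further 0.5f to centre the
 * 2x2 sampling footprint on the mapped point. */
static float map_top_left(int out_coord, float scale)
{
    return (float)out_coord * scale;
}

static float map_center_bilinear(int out_coord, float scale)
{
    return ((float)out_coord + 0.5f) * scale - 0.5f;
}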
-
-/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_nearest_neighbour_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- float8 transformed = transform_nearest(get_current_coords(), r);
-#ifdef ALIGN_CORNERS
- transformed = round(transformed);
-#endif // ALIGN_CORNERS
- const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
- vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear(get_current_coords(), r);
- vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8/S16/F16/F32. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_nearest_neighbour_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- float new_x = get_global_id(1) * scale_x;
- float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- float new_x = (get_global_id(1) + 0.5f) * scale_x;
- float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-#ifdef ALIGN_CORNERS
- new_x = round(new_x);
- new_y = round(new_y);
-#endif /* ALIGN_CORNERS */
- const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
-}
-
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * scale_x;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-
- const float new_xf = floor(new_x);
- const float new_yf = floor(new_y);
- float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
- float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
- float clamped_x_ = clamped_x;
- float clamped_x1_ = clamped_x1;
- const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
-
-#ifndef BORDER_MODE_REPLICATE
- clamped_x1 = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
- clamped_x_ = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
- clamped_x = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
- clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
-#endif /* BORDER_MODE_REPLICATE */
-
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
- const float a = new_x - new_xf;
- const float b = 1.f - a;
- const float a1 = new_y - new_yf;
- const float b1 = 1.f - a1;
- const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
- *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-}
-#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
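The closing arithmetic of scale_bilinear_nhwc is the standard bilinear blend; a scalar restatement using the same weight names (illustrative, not library code):

/* a and a1 are the fractional parts of the x and y sample coordinates; each
 * texel is weighted by the area of the opposite sub-cell. tl/tr/bl/br match
 * ins.s0..s3 in the kernel above. */
static float bilinear_blend(float tl, float tr, float bl, float br, float a, float a1)
{
    const float b  = 1.0f - a;  /* weight of the left column */
    const float b1 = 1.0f - a1; /* weight of the top row     */
    return tl * b * b1 + tr * a * b1 + bl * b * a1 + br * a * a1;
}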
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
deleted file mode 100644
index 2aa7f185c6..0000000000
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-#include "warp_helpers_quantized.h"
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value to be used for the QASYMM8 data type is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value to be used for the QASYMM8 data type is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: QASYMM8. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
- vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value to be used for the QASYMM8 data type is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value to be used for the QASYMM8 data type is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * scale_x;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-
- const float new_xf = floor(new_x);
- const float new_yf = floor(new_y);
- float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
- float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
- float clamped_x_ = clamped_x;
- float clamped_x1_ = clamped_x1;
- const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
-
-#ifndef BORDER_MODE_REPLICATE
- clamped_x1 = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
- clamped_x_ = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
- clamped_x = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
- clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
-#endif /* BORDER_MODE_REPLICATE */
-
- int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
- const float a = new_x - new_xf;
- const float b = 1.f - a;
- const float a1 = new_y - new_yf;
- const float b1 = 1.f - a1;
- const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
-
- const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
-
- DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
-
- *((__global DATA_TYPE *)out.ptr) = res;
-}
-#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
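The quantized variant brackets the same blend with a QASYMM8 dequantize/requantize pair; a scalar sketch of that round trip (illustrative helpers, assuming the -DSCALE/-DOFFSET compile-time parameters):

#include <math.h>

/* Dequantize as in the kernel: real = (q - OFFSET) * SCALE. */
static float dequantize_qasymm8(unsigned char q, float scale, int offset)
{
    return (float)((int)q - offset) * scale;
}

/* Requantize as in the kernel: convert_int_sat_rtp rounds toward +infinity,
 * which ceilf reproduces here, followed by the U8 saturation of CONVERT_SAT. */
static unsigned char requantize_qasymm8(float real, float scale, int offset)
{
    int q = (int)ceilf(real / scale) + offset;
    if (q < 0)   q = 0;
    if (q > 255) q = 255;
    return (unsigned char)q;
}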
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
deleted file mode 100644
index d2868b6731..0000000000
--- a/src/core/CL/cl_kernels/scharr_filter.cl
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This OpenCL kernel computes Scharr3x3.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void scharr3x3(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
- // Row0
- uchar16 temp = vload16(0, offset(&src, -1, -1));
- short8 left = convert_short8(temp.s01234567);
- short8 middle = convert_short8(temp.s12345678);
- short8 right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-3);
- gx += right * (short8)(+3);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(-3);
- gy += middle * (short8)(-10);
- gy += right * (short8)(-3);
-#endif /* GRAD_Y */
-
- // Row1
- temp = vload16(0, offset(&src, -1, 0));
- left = convert_short8(temp.s01234567);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-10);
- gx += right * (short8)(+10);
-#endif /* GRAD_X */
-
- // Row2
- temp = vload16(0, offset(&src, -1, 1));
- left = convert_short8(temp.s01234567);
- middle = convert_short8(temp.s12345678);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-3);
- gx += right * (short8)(+3);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(+3);
- gy += middle * (short8)(+10);
- gy += right * (short8)(+3);
-#endif /* GRAD_Y */
-
- // Store results
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
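Written out as matrices, the row-by-row accumulation in scharr3x3 applies the classic 3x3 Scharr operators (spelled out for reference only):

/* Coefficients applied by scharr3x3, gathered into matrix form. */
static const short scharr_gx[3][3] = { { -3, 0, +3 },
                                       { -10, 0, +10 },
                                       { -3, 0, +3 } };
static const short scharr_gy[3][3] = { { -3, -10, -3 },
                                       {  0,   0,  0 },
                                       { +3, +10, +3 } };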
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
deleted file mode 100644
index 7983734fc4..0000000000
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/***********************************************/
-/* Begin implementation of Sobel3x3 filter */
-/***********************************************/
-
-/** This OpenCL kernel computes a Sobel3x3 filter.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel3x3(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
- // Row0
- uchar16 temp = vload16(0, offset(&src, -1, -1));
- short8 left = convert_short8(temp.s01234567);
- short8 middle = convert_short8(temp.s12345678);
- short8 right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-1);
- gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(-1);
- gy += middle * (short8)(-2);
- gy += right * (short8)(-1);
-#endif /* GRAD_Y */
-
- // Row1
- temp = vload16(0, offset(&src, -1, 0));
- left = convert_short8(temp.s01234567);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-2);
- gx += right * (short8)(+2);
-#endif /* GRAD_X */
-
- // Row2
- temp = vload16(0, offset(&src, -1, 1));
- left = convert_short8(temp.s01234567);
- middle = convert_short8(temp.s12345678);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-1);
- gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(+1);
- gy += middle * (short8)(+2);
- gy += right * (short8)(+1);
-#endif /* GRAD_Y */
-
- // Store results
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
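As with the Scharr kernel above, the row-by-row accumulation in sobel3x3 corresponds to the familiar 3x3 Sobel operators (for reference only):

/* Coefficients applied by sobel3x3, gathered into matrix form. */
static const short sobel_gx[3][3] = { { -1, 0, +1 },
                                      { -2, 0, +2 },
                                      { -1, 0, +1 } };
static const short sobel_gy[3][3] = { { -1, -2, -1 },
                                      {  0,  0,  0 },
                                      { +1, +2, +1 } };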
-
-/**********************************************/
-/* End implementation of Sobel3x3 filter */
-/**********************************************/
-
-/***********************************************/
-/* Begin implementation of Sobel5x5 filter */
-/***********************************************/
-
-/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] left1_coeff_gx Weight of the most left pixel for gx
- * @param[in] left2_coeff_gx Weight of the left pixel for gx
- * @param[in] middle_coeff_gx Weight of the middle pixel for gx
- * @param[in] right1_coeff_gx Weight of the right pixel for gx
- * @param[in] right2_coeff_gx Weight of the most right pixel for gx
- * @param[in] left1_coeff_gy Weight of the most left pixel for gy
- * @param[in] left2_coeff_gy Weight of the left pixel for gy
- * @param[in] middle_coeff_gy Weight of the middle pixel for gy
- * @param[in] right1_coeff_gy Weight of the right pixel for gy
- * @param[in] right2_coeff_gy Weight of the most right pixel for gy
- *
- * @return a short16 containing short8 gx and short8 gy values.
- */
-short16 sobel1x5(
- Image *src,
- const short left1_coeff_gx,
- const short left2_coeff_gx,
- const short middle_coeff_gx,
- const short right1_coeff_gx,
- const short right2_coeff_gx,
- const short left1_coeff_gy,
- const short left2_coeff_gy,
- const short middle_coeff_gy,
- const short right1_coeff_gy,
- const short right2_coeff_gy)
-{
- uchar16 temp = vload16(0, offset(src, -2, 0));
- short8 gx = 0;
- short8 gy = 0;
- short8 val;
-
- val = convert_short8(temp.s01234567);
- gx += val * (short8)left1_coeff_gx;
- gy += val * (short8)left1_coeff_gy;
-
- val = convert_short8(temp.s12345678);
- gx += val * (short8)left2_coeff_gx;
- gy += val * (short8)left2_coeff_gy;
-
- val = convert_short8(temp.s23456789);
- gx += val * (short8)middle_coeff_gx;
- gy += val * (short8)middle_coeff_gy;
-
- val = convert_short8(temp.s3456789a);
- gx += val * (short8)right1_coeff_gx;
- gy += val * (short8)right1_coeff_gy;
-
- val = convert_short8(temp.s456789ab);
- gx += val * (short8)right2_coeff_gx;
- gy += val * (short8)right2_coeff_gy;
-
- return (short16)(gx, gy);
-}
-
-/** Compute a 1D vertical sobel filter 5x1 for 8 pixels assuming the input is made of 1 channel of S16 values.
- *
- * @param[in] src Pointer to source image.
- * @param[in] up1_coeff Weight of the most up pixel
- * @param[in] up2_coeff Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff Weight of the down pixel
- * @param[in] down2_coeff Weight of the most down pixel
- *
- * @return a short8 containing 8 convolved values.
- */
-short8 sobel5x1(
- Image *src,
- const short up1_coeff,
- const short up2_coeff,
- const short middle_coeff,
- const short down1_coeff,
- const short down2_coeff)
-{
- short8 val;
- short8 out = (short8)0;
-
- val = vload8(0, (__global short *)offset(src, 0, -2));
- out += val * (short8)up1_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, -1));
- out += val * (short8)up2_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 0));
- out += val * (short8)middle_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 1));
- out += val * (short8)down1_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 2));
- out += val * (short8)down2_coeff;
-
- return (short8)(out);
-}
-
-/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary single channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x5(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
- short16 gx_gy = sobel1x5(&src,
- -1, -2, 0, 2, 1,
- 1, 4, 6, 4, 1);
-
- // Store result in dst
-#ifdef GRAD_X
- vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
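The two-pass split is valid because the 5x5 Sobel operators are separable; a small check (illustrative, not library code) rebuilding Gx as the outer product of the 1D passes used here and in sobel_separable5x1:

/* The 5x5 Gx operator equals smooth (vertical pass) times deriv (horizontal
 * pass), so running the 1x5 then the 5x1 kernel matches one 5x5 convolution. */
static void sobel5x5_gx(short out[5][5])
{
    const short smooth[5] = { 1, 4, 6, 4, 1 };   /* sobel5x1 coefficients for Gx */
    const short deriv[5]  = { -1, -2, 0, 2, 1 }; /* sobel1x5 coefficients for Gx */
    for (int r = 0; r < 5; ++r)
        for (int c = 0; c < 5; ++c)
            out[r][c] = (short)(smooth[r] * deriv[c]);
}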
-
-/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
- * and output two single channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_x_ptr Pointer to the source image. Supported data types: S16
- * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16
- * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] dummy Dummy parameter to ease conditional inclusion
- */
-__kernel void sobel_separable5x1(
-#ifdef GRAD_X
- IMAGE_DECLARATION(src_x),
- IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- IMAGE_DECLARATION(src_y),
- IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
- int dummy)
-{
-#ifdef GRAD_X
- Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
- short8 gx = sobel5x1(&src_x,
- 1, 4, 6, 4, 1);
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = sobel5x1(&src_y,
- -1, -2, 0, 2, 1);
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel5x5 filter */
-/**********************************************/
-
-/***********************************************/
-/* Begin implementation of Sobel7x7 filter */
-/***********************************************/
-
-/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
-#define X0 -1
-#define X1 -4
-#define X2 -5
-#define X3 0
-#define X4 5
-#define X5 4
-#define X6 1
-
-/* Sobel 7x1 vertical X / 1x7 horizontal Y coefficients */
-#define Y0 1
-#define Y1 6
-#define Y2 15
-#define Y3 20
-#define Y4 15
-#define Y5 6
-#define Y6 1
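-
-/* For reference (an illustrative sketch, not part of the kernel): the
- * separable pair above reconstructs the full 7x7 Sobel X response as the
- * outer product of the smoothing column [1 6 15 20 15 6 1]^T and the
- * derivative row [-1 -4 -5 0 5 4 1]. A scalar C equivalent, assuming a
- * hypothetical helper and a 3-pixel valid border margin:
- *
- *   int sobel7x7_x_at(const unsigned char *img, int stride, int x, int y)
- *   {
- *       const int d[7] = { -1, -4, -5, 0, 5, 4, 1 }; // derivative (X0..X6)
- *       const int s[7] = { 1, 6, 15, 20, 15, 6, 1 }; // smoothing (Y0..Y6)
- *       int acc = 0;
- *       for(int j = 0; j < 7; ++j)     // vertical smoothing taps
- *           for(int i = 0; i < 7; ++i) // horizontal derivative taps
- *               acc += s[j] * d[i] * img[(y + j - 3) * stride + (x + i - 3)];
- *       return acc;
- *   }
- */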
-
-/* Calculates a single horizontal iteration. */
-#define SOBEL1x1_HOR(src, gx, gy, idx) \
- { \
- int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
- gx += val * X##idx; \
- gy += val * Y##idx; \
- }
-
-/* Calculates a single vertical iteration. */
-#define SOBEL1x1_VERT(src, g, direction, idx) \
- { \
- int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
- g += val * (int8)direction##idx; \
- }
-
-/* Calculates a full 1x7 horizontal convolution. */
-#define SOBEL1x7(ptr, gx, gy) \
- SOBEL1x1_HOR(ptr, gx, gy, 0) \
- SOBEL1x1_HOR(ptr, gx, gy, 1) \
- SOBEL1x1_HOR(ptr, gx, gy, 2) \
- SOBEL1x1_HOR(ptr, gx, gy, 3) \
- SOBEL1x1_HOR(ptr, gx, gy, 4) \
- SOBEL1x1_HOR(ptr, gx, gy, 5) \
- SOBEL1x1_HOR(ptr, gx, gy, 6)
-
-/* Calculates a full 7x1 vertical convolution. */
-#define SOBEL7x1(ptr, g, direction) \
- SOBEL1x1_VERT(ptr, g, direction, 0) \
- SOBEL1x1_VERT(ptr, g, direction, 1) \
- SOBEL1x1_VERT(ptr, g, direction, 2) \
- SOBEL1x1_VERT(ptr, g, direction, 3) \
- SOBEL1x1_VERT(ptr, g, direction, 4) \
- SOBEL1x1_VERT(ptr, g, direction, 5) \
- SOBEL1x1_VERT(ptr, g, direction, 6)
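-
-/* e.g. (illustration) SOBEL1x7(&src, gx, gy) expands to seven SOBEL1x1_HOR
- * steps that accumulate val * X0..X6 into gx and val * Y0..Y6 into gy, where
- * each val is an 8-pixel vector load offset by idx - 3 columns, so one
- * work-item produces eight horizontally adjacent partial results per pass. */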
-
-/** Apply a 1x7 Sobel matrix to a single channel U8 input image and output two temporary single channel S32 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x7(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
- int8 gx = (int8)0;
- int8 gy = (int8)0;
-
- SOBEL1x7(&src, gx, gy);
-
- // Store result in dst
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global int *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global int *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 7x1 convolution matrix to two single channel S32 input temporary images and output two single channel S32 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32
- * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32
- * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] dummy Dummy parameter to ease conditional inclusion
- */
-__kernel void sobel_separable7x1(
-#ifdef GRAD_X
- IMAGE_DECLARATION(src_x),
- IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- IMAGE_DECLARATION(src_y),
- IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
- int dummy)
-{
-#ifdef GRAD_X
- Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- int8 gx = 0;
- SOBEL7x1(&src_x, gx, Y);
- vstore8(gx, 0, (__global int *)dst_gx.ptr);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- int8 gy = 0;
- SOBEL7x1(&src_y, gy, X);
- vstore8(gy, 0, (__global int *)dst_gy.ptr);
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel7x7 filter */
-/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
deleted file mode 100644
index 01f5de47cf..0000000000
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
-
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
-{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load the sum value of the 1D logits vector (row)
- DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
-#if defined(LOG_SOFTMAX)
- sum_val = log(sum_val);
- data0 -= sum_val;
-#else // defined(LOG_SOFTMAX)
- data0 /= sum_val;
-#endif // defined(LOG_SOFTMAX)
-
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
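-
-/* Illustrative scalar reference for the normalization step above (a sketch,
- * not part of the kernel; softmax_norm_row is a hypothetical helper name):
- *
- *   void softmax_norm_row(float *row, int n, float sum)
- *   {
- *       for(int i = 0; i < n; ++i)
- *           row[i] /= sum; // softmax: exponentials divided by their row sum
- *       // log softmax instead subtracts log(sum) from the shifted logits
- *   }
- */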
-
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2, 4, 8, 16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
-
- // Initialize local maximum
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
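- // widx flags the first VECTOR_SIZE_LEFTOVER lanes as valid; the remaining
- // lanes fall back to MINVAL so they cannot affect the row maximum.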
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
- }
-
- // Perform max reduction
- DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
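- // (MAX_REDUCE folds the VECTOR_SIZE lanes down to a single scalar; e.g. for
- // a 4-lane vector v it is equivalent to max(max(v.s0, v.s1), max(v.s2, v.s3)).)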
- *((__global DATA_TYPE *)maxo.ptr) = max_val;
-
- /* Second section */
-
- // Set sum vector
- VEC_TYPE sum1D = 0;
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
-#endif /* LOG_SOFTMAX */
- sum1D += data;
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- // Shift values, exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
-}
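-
-/* The kernel above implements the numerically stable softmax identity
- * (illustration):
- *
- *   m    = max_j x[j]
- *   e[i] = exp(beta * (x[i] - m))
- *   sum  = e[0] + e[1] + ... + e[n-1]
- *
- * Dividing e[i] by sum later yields exp(beta * x[i]) / sum_j exp(beta * x[j])
- * exactly, because the common exp(-beta * m) factor cancels. */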
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2, 4, 8, 16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
-
- // Define one temporary vector per work-item.
- __local VEC_TYPE tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- // How many work-items are needed to complete the computation.
- //TODO: Optimize this calculation (avoid %).
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- SELECT_TYPE widx;
- if(lid == 0)
- {
- // Handle the leftover elements when SRC_WIDTH is not a multiple of VECTOR_SIZE
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = max_val_vec;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Second section */
-
- // Set sum vector
- VEC_TYPE sum1D = 0;
- DATA_TYPE max_val = max_local;
-
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- //TODO: Optimize the calculation (avoid %).
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
- {
- // Handle the first VECTOR_SIZE_LEFTOVER elements of the row (SRC_WIDTH is not a multiple of VECTOR_SIZE)
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
- }
-}
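-
-/* The unrolled GRID_SIZE ladders above are a standard local-memory tree
- * reduction; a compact loop form of the same pattern (a sketch, assuming
- * GRID_SIZE is a power of two) would be:
- *
- *   for(uint s = GRID_SIZE / 2; s > 1; s >>= 1)
- *   {
- *       if(lid < s)
- *           tmp_local[lid] += tmp_local[lid + s];
- *       barrier(CLK_LOCAL_MEM_FENCE);
- *   }
- *
- * with work-item 0 combining the final pair, exactly as done for the max
- * reduction in the first section. */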
-
-#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
deleted file mode 100644
index b7a6e00dfa..0000000000
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ /dev/null
@@ -1,532 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)
-
-#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
-
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The minimum value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
-{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load the sum value of the 1D logits vector (row)
- int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
-
- // It would be better to calculate this in the previous layer and pass it here as a parameter
- uint sum_val_u = convert_uint(sum_val);
- int headroom_plus_one = clz(sum_val_u);
- int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
- int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
- VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
- VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
-
- // This was already calculated in the previous layer; it should be stored in a temporary output and reused
- VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
- VEC_INT data_diff_mult = data_diff;
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
- if(INPUT_BETA_MULTIPLIER > 1)
- {
- data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
- }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
-
- VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
- data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
-#ifdef QASYMM8_SIGNED
- data += (VEC_INT)(MIN_VALUE);
-#endif /* QASYMM8_SIGNED */
- data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
- VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
-
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
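-
-/* Fixed-point bookkeeping above (illustrative summary): the headroom shift
- * normalizes sum_val into [2^31, 2^32) so that
- * ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 can approximate 1/sum in Q0.31;
- * num_bits_over_unit tracks the scale discarded by that normalization, and
- * the final rounding divide-by-pow2 brings the product back down to the
- * 8-bit output range. */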
-
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
-
-VEC_INT mult_by_quantized_multiplier(VEC_INT data)
-{
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
- if(INPUT_BETA_MULTIPLIER > 1)
- {
- return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
- }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
- return data;
-}
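-
-/* For reference (assuming the gemmlowp-style fixed-point convention of
- * helpers_asymm.h, where ASYMM_MULT is a saturating rounding doubling
- * high multiply): the helper above computes approximately
- *
- *   (data << INPUT_BETA_LEFT_SHIFT) * INPUT_BETA_MULTIPLIER / 2^31
- *
- * i.e. it rescales the integer logit differences by the quantized beta. */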
-
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The minimum value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
- VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
-
- // Calculate max of row
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
- }
-
- // Perform max reduction
- DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- *((__global DATA_TYPE *)maxo.ptr) = max_local;
-
- // Second part
-
- // Load max value of 1D logits vector (row)
- int max_val = convert_int(max_local);
-
- // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
- VEC_INT sum1D = 0;
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data_diff, 0, (__global int *)dst_addr);
- data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- sum1D += select(0, data_fp, widx);
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- // Shift values, exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-
- // Perform sum reduction
- *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
-}
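-
-/* Note (illustration): dst receives the raw S32 differences (x - max), not
- * the exponentials; sum accumulates exp(...) in Q(EXP_ACCUMULATION_INT_BITS).
- * softmax_layer_norm_quantized then recomputes exp from the stored
- * differences and divides by this sum. */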
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The minimum value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2, 4, 8, 16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
- // Define one temporary vector per work-item.
- __local VEC_INT tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE max_val_vec = vec_min_val;
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- // How many work-items are needed to complete the computation.
- //TODO: Optimize this calculation (avoid %).
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_INT widx;
- if(lid == 0)
- {
- // Handle the leftover elements when SRC_WIDTH is not a multiple of VECTOR_SIZE
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Second section */
-
- // Set sum vector
- VEC_INT sum1D = 0;
- int max_val = convert_int(max_local);
-
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- //TODO: Optimize the calculation (avoid %).
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
- {
- // Handle the leftover elements when the width is not a multiple of the vector size
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
- data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- data_fp = select(0, data_fp, widx);
- sum1D = sum1D + data_fp;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
- }
-}
-#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
-#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
deleted file mode 100644
index 0ef1648d94..0000000000
--- a/src/core/CL/cl_kernels/tablelookup.cl
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** This function performs table lookup on U8 input/output images.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- *
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] lut LUT table. Supported data types: U8
- */
-__kernel void tablelookup_U8(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- __global uchar *lut)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- /* Load input data */
- uchar8 data = vload8(0, src.ptr);
-
- /* Load lut data */
- uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3],
- lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]);
-
- /* Store result */
- vstore8(lut_data, 0, dst.ptr);
-}
-
-/** This function performs table lookup on S16 input/output images.
- *
- * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: S16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] lut LUT table. Supported data types: S16
- * @param[in] offset LUT offset
- * @param[in] count Number of elements in the LUT
- */
-__kernel void tablelookup_S16(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- __global short *lut,
- uint offset,
- uint count)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- /* Load input data */
- short8 data = vload8(0, (__global short *)src.ptr);
-
- /* Load output data */
- int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
-
- /* Calculate index */
- int8 index = convert_int8(data) + (int8)(offset);
- int8 cond = (index >= 0 && index < (int8)count);
- index = select(0, index, cond);
-
- /* Load lut data */
- int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
- lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
-
- /* Select output data depending on condition */
- lut_data = select(out_data, lut_data, cond);
-
- /* Store result */
- vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
-}
diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
deleted file mode 100644
index ff3ac05ef4..0000000000
--- a/src/core/CL/cl_kernels/threshold.cl
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Perform binary thresholding on an image.
- *
- * @param[in] in_ptr Pointer to the source image
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] out_ptr Pointer to the destination image
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] false_val False value
- * @param[in] true_val True value
- * @param[in] threshold The threshold value
- */
-__kernel void threshold_binary(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const uchar false_val,
- const uchar true_val,
- const uchar threshold)
-{
- // Get pixels pointer
- Image in = CONVERT_TO_IMAGE_STRUCT(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
- // Load data
- uchar16 in_data = vload16(0, in.ptr);
-
- // Perform binary thresholding
- in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold);
-
- // Store result
- vstore16(in_data, 0, out.ptr);
-}
-
-/** Perform range thresholding on an image.
- *
- * @param[in] in_ptr Pointer to the source image
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[out] out_ptr Pointer to the destination image
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] false_val False value
- * @param[in] true_val True value
- * @param[in] lower Lower threshold
- * @param[in] upper Upper threshold
- */
-__kernel void threshold_range(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const uchar false_val,
- const uchar true_val,
- const uchar lower,
- const uchar upper)
-{
- // Get pixels pointer
- Image in = CONVERT_TO_IMAGE_STRUCT(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
-
- // Load data
- uchar16 in_data = vload16(0, in.ptr);
-
- // Perform range thresholding
- in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < (uchar16)lower);
-
- // Store result
- vstore16(in_data, 0, out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
new file mode 100644
index 0000000000..8129606277
--- /dev/null
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -0,0 +1,1451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
+#define ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
+
+// *INDENT-OFF*
+// clang-format off
+
+#define TILE_VECTOR_SIZE1 1
+#define TILE_VECTOR_SIZE2 2
+#define TILE_VECTOR_SIZE3 3
+#define TILE_VECTOR_SIZE4 4
+#define TILE_VECTOR_SIZE5 8
+#define TILE_VECTOR_SIZE6 8
+#define TILE_VECTOR_SIZE7 8
+#define TILE_VECTOR_SIZE8 8
+#define TILE_VECTOR_SIZE9 16
+#define TILE_VECTOR_SIZE10 16
+#define TILE_VECTOR_SIZE11 16
+#define TILE_VECTOR_SIZE12 16
+#define TILE_VECTOR_SIZE13 16
+#define TILE_VECTOR_SIZE14 16
+#define TILE_VECTOR_SIZE15 16
+#define TILE_VECTOR_SIZE16 16
+
+#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
+#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
+#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
+#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
+#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
+#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
+#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
+
+/** Tile object
+ * A tile object is a 2D memory block and can be accessed using the following syntax:
+ * -# BASENAME[m0].v = access the vector at row "m0" (OpenCL vector)
+ * -# BASENAME[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access)
+ *
+ * @param[in] DATA_TYPE Data type of the tile
+ * @param[in] H Number of tile rows
+ * @param[in] W Number of tile columns
+ * @param[in] BASENAME Tile's name
+ */
+#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
+#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
+ union { \
+ DATA_TYPE s[TILE_VECTOR_SIZE##W]; \
+ TILE_VECTOR_TYPE##W(DATA_TYPE) v; \
+ } BASENAME[H]
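+
+// Usage sketch (illustrative, not part of the library): TILE(float, 4, 2, dst)
+// declares a union-backed array of 4 float2 rows, so the vector and scalar views
+// below alias the same storage.
+//
+//   TILE(float, 4, 2, dst);
+//   dst[0].v    = (float2)(1.0f, 2.0f); // vector access to row 0
+//   dst[3].s[1] = 5.0f;                 // scalar access to row 3, column 1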
+
+#define TENSOR4D_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_stride_w, \
+ uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_stride_w, \
+ uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
+#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
+
+#define TENSOR4D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+
+/** Legacy tensor 4D arguments
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
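+
+// Usage sketch (illustrative): a kernel signature taking one 4D input and one 4D
+// output tensor; the "src"/"dst" basenames are arbitrary example names.
+//
+//   __kernel void example(TENSOR4D_T(src, BUFFER), TENSOR4D_T(dst, BUFFER))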
+
+#define TENSOR4D_RO_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
+
+/** Read-Only (RO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
+
+#define TENSOR4D_WO_T_IMAGE(name) \
+ __write_only image2d_t name##_img, \
+ TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
+
+/** Write-Only (WO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
+
+#define TENSOR3D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
+#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
+
+#if !defined(UNROLL_WITH_PRAGMA)
+#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
+
+#define LOOP_UNROLLING_1(idx, step, macro) (macro)
+#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
+#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
+
+#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
+ { \
+ type idx = start; \
+ LOOP_UNROLLING_##num(idx, step, macro); \
+ }
+#else // !defined(UNROLL_WITH_PRAGMA)
+#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
+ { \
+ _Pragma("unroll") \
+ for(type idx = start; idx < (num * step); idx += step) \
+ { \
+ (macro); \
+ } \
+ }
+#endif // !defined(UNROLL_WITH_PRAGMA)
+#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
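+
+// Usage sketch (illustrative): unroll 8 iterations that zero-initialize a tile.
+// With UNROLL_WITH_PRAGMA defined this becomes a "#pragma unroll" loop instead of
+// a macro-expanded sequence.
+//
+//   TILE(int, 8, 4, acc);
+//   LOOP_UNROLLING(int, i, 0, 1, 8,
+//   {
+//       acc[i].v = (int4)0;
+//   })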
+
+/** Get the get_global_id with partial N0. This function is useful when the dimension is not multiple of N0 and we need to use a partial N0
+ * to avoid out-of-bound read/write
+ *
+ * @note PARTIAL_N0 is used for get_global_id(IDX) = 0.
+ *
+ * @param[in] IDX get_global_id index (0,1 and 2 only)
+ * @param[in] N0 Number of elements read/written on the IDX direction
+ * @param[in] PARTIAL_N0 Number of elements read/written on the IDX direction for get_global_id(IDX) = 0. If zero,
+ * N0 elements are read/written on the IDX direction for get_global_id(IDX) = 0 as well
+ */
+#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
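+
+// Worked example (illustrative): with N0 = 4 and PARTIAL_N0 = 2 (width not a
+// multiple of 4), work-item 0 starts at element 0 and covers the 2 leftover
+// elements, while later work-items start at gid * 4 - 2, so every 4-wide access
+// stays in bounds.
+//
+//   int x = GET_SPATIAL_IDX(0, 4, 2); // gid 0 -> 0, gid 1 -> 2, gid 2 -> 6, ...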
+
+/** Integer 8-bit dot product function
+ *
+ * @note Performs: c += dot(a, b)
+ *
+ * @param[in] A_DATA_TYPE A (lhs) data type
+ * @param[in] B_DATA_TYPE B (rhs) data type
+ * @param[in] C_DATA_TYPE C (accumulator) data type
+ * @param[in] K0 Number of accumulations
+ * @param[in] a OpenCL vector a
+ * @param[in] b OpenCL vector b
+ * @param[in] c Scalar variable c
+ */
+#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
+#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
+#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
+ })
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
+#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
+#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
+ c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
+ })
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
+ c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
+ })
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
+ c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
+ c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
+ c += (C_DATA_TYPE)(a).s3 * (C_DATA_TYPE)(b).s3; \
+ })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
+ DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \
+ })
+#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \
+ })
+#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
+ DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \
+ })
+#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
+ })
+#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \
+ })
+#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \
+ })
+#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \
+ })
+#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ })
+#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sC), ((b).sC), c); \
+ })
+#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCD), ((b).sCD), c); \
+ })
+#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
+ DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
+ DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCDE), ((b).sCDE), c); \
+ })
+#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
+ DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
+ })
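+
+// Usage sketch (illustrative): accumulate dot(a, b) of two char4 vectors into an
+// int accumulator. On devices exposing a dot-product extension this maps to a
+// single dot/arm_dot instruction, otherwise to the scalar fallback above.
+//
+//   char4 a = (char4)(1, 2, 3, 4);
+//   char4 b = (char4)(5, 6, 7, 8);
+//   int   c = 0;
+//   DOT_PRODUCT_INTEGER8(char, char, int, 4, a, b, c); // c == 70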
+
+/** Integer 8-bit reduction (sum of the vector elements)
+ *
+ * @note Performs: c += reduce_add(a), implemented as a dot product with a vector of ones
+ *
+ * @param[in] A_DATA_TYPE A (lhs) data type
+ * @param[in] B_DATA_TYPE B (rhs) data type
+ * @param[in] C_DATA_TYPE C (accumulator) data type
+ * @param[in] K0 Number of accumulations
+ * @param[in] a OpenCL vector a
+ * @param[in] c Scalar variable c
+ */
+#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
+#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
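+
+// Usage sketch (illustrative): sum the elements of a char4 vector into an int
+// accumulator.
+//
+//   char4 a = (char4)(1, 2, 3, 4);
+//   int   c = 0;
+//   REDUCE_INTEGER8(char, char, int, 4, a, c); // c == 10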
+
+/** Load a vector from global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ */
+#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
+#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
+#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
+ VLOAD(WIDTH) \
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
+#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
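+
+// Usage sketch (illustrative): load 4 floats from row "y" of a BUFFER tensor whose
+// kernel arguments use the "src" basename (e.g. declared via TENSOR4D_T(src, BUFFER));
+// "y" is an arbitrary row index.
+//
+//   float4 row = V_LOAD(float, 4, BUFFER, src, 0, y, src_stride_y);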
+
+/** Store a vector in global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] VALUES Values to store in memory
+ */
+#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
+ VSTORE(WIDTH) \
+ (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
+#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
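+
+// Usage sketch (illustrative): store the vector loaded above at the same position
+// of a "dst" tensor (basename assumed declared in the kernel signature).
+//
+//   V_STORE(float, 4, BUFFER, dst, 0, y, dst_stride_y, row);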
+
+/** Load a tile from global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).
+ * In most cases it should be 1, but it is useful when the rows to load are a multiple of STRIDE_Y apart (e.g. loading the weights of a convolution layer).
+ * In this case the address calculation is performed as: ((Y) + _i * YI_MULTIPLIER) * STRIDE_Y
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[out] dst Output tile
+ */
+#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
+ }) \
+ })
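+
+// Usage sketch (illustrative): load a 4x8 fp32 tile starting at column "x", row "y"
+// of a BUFFER tensor "src"; YI_MULTIPLIER = 1 loads 4 consecutive rows. The names
+// "x", "y" and "src" are arbitrary examples.
+//
+//   TILE(float, 4, 8, in);
+//   T_LOAD(float, 4, 8, BUFFER, src, x, y, 1, src_stride_y, in);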
+
+/** Store a VECTOR variable (e.g. int4, int8, char2 etc.) to a specified column in the TILE object
+ *
+ * @param[in] VECTOR Vector variable to store
+ * @param[in, out] TILE Tile variable to store to
+ * @param[in] WIDTH Width of the vector variable, also height of the tile (e.g. 2 if char2)
+ * @param[in] COLUMN Column index of the tile
+ */
+#define COPY_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, WIDTH, COLUMN) COPY_VECTOR_TO_TILE_COLUMN_STR(VECTOR, TILE, WIDTH, COLUMN)
+#define COPY_VECTOR_TO_TILE_COLUMN_STR(VECTOR, TILE, WIDTH, COLUMN) COPY_##WIDTH##_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN)
+#define COPY_1_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR; \
+ })
+
+#define COPY_2_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ })
+
+#define COPY_3_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ })
+
+#define COPY_4_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ TILE[3].s[COLUMN] = VECTOR.s3; \
+ })
+
+#define COPY_8_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ TILE[3].s[COLUMN] = VECTOR.s3; \
+ TILE[4].s[COLUMN] = VECTOR.s4; \
+ TILE[5].s[COLUMN] = VECTOR.s5; \
+ TILE[6].s[COLUMN] = VECTOR.s6; \
+ TILE[7].s[COLUMN] = VECTOR.s7; \
+ })
+
+#define COPY_16_VECTOR_TO_TILE_COLUMN(VECTOR, TILE, COLUMN) \
+ ({ \
+ TILE[0].s[COLUMN] = VECTOR.s0; \
+ TILE[1].s[COLUMN] = VECTOR.s1; \
+ TILE[2].s[COLUMN] = VECTOR.s2; \
+ TILE[3].s[COLUMN] = VECTOR.s3; \
+ TILE[4].s[COLUMN] = VECTOR.s4; \
+ TILE[5].s[COLUMN] = VECTOR.s5; \
+ TILE[6].s[COLUMN] = VECTOR.s6; \
+ TILE[7].s[COLUMN] = VECTOR.s7; \
+ TILE[8].s[COLUMN] = VECTOR.s8; \
+ TILE[9].s[COLUMN] = VECTOR.s9; \
+ TILE[10].s[COLUMN] = VECTOR.sA; \
+ TILE[11].s[COLUMN] = VECTOR.sB; \
+ TILE[12].s[COLUMN] = VECTOR.sC; \
+ TILE[13].s[COLUMN] = VECTOR.sD; \
+ TILE[14].s[COLUMN] = VECTOR.sE; \
+ TILE[15].s[COLUMN] = VECTOR.sF; \
+ })
+
+/** Load SRC_HEIGHT x SRC_WIDTH elements from global memory (tensor), and store them in a SRC_WIDTH x SRC_HEIGHT tile
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] SRC_HEIGHT Number of source rows, or number of columns of the output tile
+ * @param[in] SRC_WIDTH Number of source columns, or number of tile rows
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).
+ * In most cases it should be 1, but it is useful when the rows to load are a multiple of STRIDE_Y apart
+ * (e.g. loading the weights of a convolution layer).
+ * In this case the address calculation is performed as: ((Y) + _i * YI_MULTIPLIER) * STRIDE_Y
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_TRANSPOSED(DATA_TYPE, SRC_HEIGHT, SRC_WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, SRC_HEIGHT, \
+ { \
+ VEC_DATA_TYPE(DATA_TYPE, SRC_WIDTH) \
+ tmp = V_LOAD(DATA_TYPE, SRC_WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
+ COPY_VECTOR_TO_TILE_COLUMN(tmp, dst, SRC_WIDTH, _i); \
+ }) \
+ })
+
+/** Load a tile from global memory (tensor) using an indirect Y index tile
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] indirect_y Indirect Y index tile
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
+ }) \
+ })
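+
+// Usage sketch (illustrative): gather 4 rows whose Y coordinates come from an index
+// tile instead of being consecutive; "idx" must be filled with valid row indices
+// before the load.
+//
+//   TILE(int, 4, 1, idx);   // idx[_i].v selects the source row of dst row _i
+//   TILE(float, 4, 4, dst);
+//   T_LOAD_INDIRECT(float, 4, 4, BUFFER, src, x, src_stride_y, idx, dst);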
+
+/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load
+ *
+ * @note If WIDTH1_CONDITION is true, the WIDTH1 length is used for the load
+ * @note The rows are loaded in reverse order, so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH0 Load width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Load width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 load
+ * @param[out] dst Output tile
+ * @param[out] indirect_y Indirect Y index tile
+ */
+#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
+ ({ \
+ if(WIDTH1_CONDITION) \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ VLOAD_PARTIAL(WIDTH0, WIDTH1) \
+ (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ }) \
+ } \
+ else \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
+ }) \
+ } \
+ })
+
+/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension
+ * @param[in] TILE_WIDTH Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
+ * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Starting batch index
+ * @param[in] Y Starting Y index
+ * @param[in] X Starting X index
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Number of elements in the X (width) dimension of the tensor
+ * @param[in] TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
+ { \
+ LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
+ { \
+ int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
+ _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
+ int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
+ if(_src_valid_y != 0) \
+ { \
+ dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
+ } \
+ }) \
+ }) \
+ })
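+
+// Usage sketch (illustrative): load a 3x3 spatial window of 4 channels from an NHWC
+// tensor "src". Out-of-bound positions are skipped rather than zeroed, so "in" should
+// be pre-initialized with the padding value. "b", "yi", "xi", "ck", SRC_WIDTH and
+// SRC_HEIGHT are assumed indices/compile-time tensor dimensions, not library symbols.
+//
+//   TILE(float, 9, 4, in);
+//   T_LOAD_NHWC(float, 3, 3, 4, BUFFER, src, b, yi, xi, ck, SRC_WIDTH, SRC_HEIGHT, src_stride_y, in);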
+
+/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension
+ * @param[in] TILE_WIDTH Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
+ * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Starting batch index
+ * @param[in] Y Starting Y index
+ * @param[in] X Starting X index
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Number of elements in the X (width) dimension of the tensor
+ * @param[in] TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor
+ * @param[in] DILATION_X Dilation for the X increment
+ * @param[in] DILATION_Y Dilation for the Y increment
+ * @param[in] BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bound reads
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
+ { \
+ LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
+ { \
+ int _src_y = (X) + _xk * (DILATION_X); \
+ int _src_z = ((Y) + _yk * (DILATION_Y)); \
+ int _src_w = (B); \
+ bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
+ if(!(BOUNDARY_CHECK)) \
+ { \
+ dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
+ } \
+ else \
+ { \
+ if(_src_valid_y) \
+ { \
+ dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
+ } \
+ } \
+ }) \
+ }) \
+ })
+
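+// Editor's sketch (illustrative; placeholder names as above). With BOUNDARY_CHECK == false the
+// bounds test is skipped entirely, so the caller must guarantee that every dilated (x, y)
+// coordinate is in range:
+//
+//   TILE(float, 3 * 3, 4, in00);
+//   T_LOAD_NHWC_WITH_DILATION(float, 3, 3, 4, BUFFER, src, b, y0, x0, ck, SRC_WIDTH, SRC_HEIGHT, 2, 2, true, in00);
+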
+/** Load a tile from global memory (tensor) when the tensor is stored using an NHWC layout, using indirect X and Y coordinates
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
+ * In case of cl_image, TILE_CHANNELS must be a multiple of 4 (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Starting batch index
+ * @param[in] Y Starting Y index
+ * @param[in] X Starting X index
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Width of the tensor in elements (X dimension)
+ * @param[in] TENSOR_HEIGHT Height of the tensor in elements (Y dimension)
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
+ * @param[in] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
+ _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
+ int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
+ if(_src_valid_y != 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
+ } \
+ }) \
+ })
+
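+// Editor's sketch (illustrative; placeholder names). xi and yi hold one (x, y) offset per tile
+// element; out-of-bound elements are left untouched, so dst is typically cleared first:
+//
+//   TILE(int, 9, 1, my_xi);
+//   TILE(int, 9, 1, my_yi);
+//   // ... fill my_xi / my_yi with e.g. a 3x3 sampling pattern ...
+//   T_LOAD_NHWC_INDIRECT(float, 9, 4, BUFFER, src, b, y0, x0, ck, SRC_WIDTH, SRC_HEIGHT, src_stride_y, my_xi, my_yi, in00);
+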
+/** Load a tile from global memory (tensor) using an indirect buffer for the Y coordinates
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * When TENSOR_TYPE=IMAGE, the out-of-bound check is skipped, as out-of-range image reads are well defined
+ * In case of cl_image, TILE_CHANNELS must be a multiple of 4 (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] C Starting C index
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
+ * 16 is the maximum indirect buffer size.
+ * @param[out] dst Output tile
+ */
+#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ if(yi[0].s[_i] >= 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+ } \
+ }) \
+ })
+
+#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+ }) \
+ })
+
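+// Editor's note: T_LOAD2D_INDIRECT dispatches on TENSOR_TYPE by token pasting, selecting one of
+// the two variants above. The BUFFER variant skips negative indices to avoid reading outside the
+// allocation; the IMAGE variant omits the check, as out-of-range image reads are well defined.
+// yi is a single tile row whose TILE_AREA vector lanes hold the row indices (yi[0].s[i]), e.g.:
+//
+//   TILE(int, 1, 8, my_yi);
+//   T_LOAD2D_INDIRECT(float, 8, 4, BUFFER, src, ck, src_stride_y, my_yi, out_tile); // placeholder names
+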
+/** Load a tile from global memory (tensor) when the tensor is stored using an NDHWC layout, using indirect X, Y and Z coordinates
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
+ * In case of cl_image, TILE_CHANNELS must be a multiple of 4 (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Starting batch index
+ * @param[in] Z Starting Z index
+ * @param[in] Y Starting Y index
+ * @param[in] X Starting X index
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Width of the tensor in elements (X dimension)
+ * @param[in] TENSOR_HEIGHT Height of the tensor in elements (Y dimension)
+ * @param[in] TENSOR_DEPTH Depth of the tensor in elements (Z dimension)
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
+ * @param[in] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
+ * @param[in] zi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Z coordinate
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \
+ _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
+ int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
+ && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
+ if(_src_valid_y != 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
+ } \
+ }) \
+ })
+
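+// Editor's note: the 3D coordinates are linearised into a single row index before the load,
+// matching the NDHWC layout:
+//
+//   _src_y = (X + xi) + (Y + yi) * TENSOR_WIDTH + (Z + zi) * TENSOR_WIDTH * TENSOR_HEIGHT
+//            + B * TENSOR_WIDTH * TENSOR_HEIGHT * TENSOR_DEPTH
+//
+// Out-of-bound elements are left untouched in dst, so callers typically clear dst first.
+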
+/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
+ *
+ * @note If WIDTH1_CONDITION is true, the store uses the WIDTH1 length
+ * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of src rows
+ * @param[in] WIDTH0 Store width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Store width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported;
+ * cl_image is not supported.
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store
+ * @param[in] src Input tile
+ * @param[in] indirect_y Indirect Y index tile
+ */
+#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
+ ({ \
+ if(WIDTH1_CONDITION) \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ VSTORE_PARTIAL(WIDTH0, WIDTH1) \
+ (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ }) \
+ } \
+ else \
+ { \
+ LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+ { \
+ VSTORE(WIDTH0) \
+ (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ }) \
+ } \
+ })
+
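+// Editor's sketch (illustrative; PARTIAL_N0, x_off, x_cond, c_tile and dst_indirect_y are
+// placeholder names). A typical use is handling the leftover columns of the destination in
+// kernels that shift the partial block onto the first work-item along X:
+//
+//   const bool x_cond = (PARTIAL_N0 != 0) && (get_global_id(0) == 0);
+//   T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, x_off, dst_stride_y, x_cond, c_tile, dst_indirect_y);
+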
+/** Offset correction for the QASYMM8 computation
+ *
+ * @param[in] ACC_DATA_TYPE Accumulator data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] K0 Number of src columns
+ * @param[in] SRC_OFFSET Source quantization offset
+ * @param[in] WEI_OFFSET Weights quantization offset
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ ACC_DATA_TYPE _tm = 0; \
+ LOOP_UNROLLING(int, _k0, 0, 1, K0, \
+ { \
+ _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \
+ }) \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ dst[_m0].s[_n0] += _tm; \
+ LOOP_UNROLLING(int, _k0, 0, 1, K0, \
+ { \
+ dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
+ }) \
+ }) \
+ }) \
+ })
+
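+// Editor's note: this implements the cross terms of the quantized GEMM expansion
+//
+//   sum_k (lhs_k + SRC_OFFSET) * (rhs_k + WEI_OFFSET)
+//     = sum_k lhs_k * rhs_k + WEI_OFFSET * sum_k lhs_k + SRC_OFFSET * sum_k rhs_k + K0 * SRC_OFFSET * WEI_OFFSET
+//
+// The macro adds the two middle terms to dst; the offset sign convention and the constant
+// K0 * SRC_OFFSET * WEI_OFFSET term are the caller's responsibility.
+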
+/** 8-bit quantization with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset used for both the per-tensor and per-channel quantization
+ * @param[in] DST_SHIFT Quantization shift for the per-tensor quantization
+ * @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
+ * @param[in] src Input tile
+ * @param[in] dst_multipliers Output multipliers tile for the per-channel quantization
+ * @param[in] dst_shifts Output shift tile for the per-channel quantization
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
+#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
+
+/** 8-bit per-tensor quantization with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset
+ * @param[in] DST_SHIFT Quantization shift for the per-tensor quantization
+ * @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
+ * @param[in] src Input tile
+ * @param[in] dst_multipliers (unused)
+ * @param[in] dst_shifts (unused)
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
+ _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
+ SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
+ long a_64 = (long)(_src); \
+ long b_64 = (long)(DST_MULTIPLIER); \
+ long ab_64 = a_64 * b_64; \
+ long mask1 = 1 << 30; \
+ long mask2 = 1 - (1 << 30); \
+ long is_positive_or_zero = ab_64 >= 0; \
+ long nudge = select(mask2, mask1, is_positive_or_zero); \
+ SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
+ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
+ if(DST_SHIFT >= 0) \
+ { \
+ long mask = ((((int)1) << DST_SHIFT) - (long)1); \
+ long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
+ _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
+ } \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ }) \
+ }) \
+ })
+
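+// Editor's note: for DST_SHIFT >= 0 the body above computes, in effect,
+//
+//   dst = convert_sat(round_half_away_from_zero(src * DST_MULTIPLIER / 2^(31 + DST_SHIFT)) + DST_OFFSET)
+//
+// The 64-bit multiply with the +/- 2^30 nudge is the usual fixed-point rounding-doubling high
+// multiply; a negative DST_SHIFT instead pre-multiplies src by 2^(-DST_SHIFT).
+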
+/** 8-bit per-channel quantization with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset
+ * @param[in] DST_SHIFT (unused)
+ * @param[in] DST_MULTIPLIER (unused)
+ * @param[in] src Input tile
+ * @param[in] dst_multipliers Output multipliers tile for the per-channel quantization
+ * @param[in] dst_shifts Output shift tile for the per-channel quantization
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ SRC_DATA_TYPE _tmp2 = 0; \
+ SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
+ SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
+ SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
+ _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
+ SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
+ long a_64 = (long)(_src); \
+ long b_64 = (long)(_dst_multiplier); \
+ long ab_64 = a_64 * b_64; \
+ long mask1 = 1 << 30; \
+ long mask2 = 1 - (1 << 30); \
+ long is_positive_or_zero = ab_64 >= 0; \
+ long nudge = select(mask2, mask1, is_positive_or_zero); \
+ SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
+ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
+ long mask = ((((int)1) << _dst_shift) - (int)1); \
+ long threshold = (mask >> 1) + any(_tmp); \
+ _tmp2 = _tmp >> _dst_shift; \
+ _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
+ _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ }) \
+ }) \
+ })
+
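+// Editor's note on the per-channel variant above: any(_tmp) on a scalar int returns 1 when the
+// sign bit is set, so (mask >> 1) + any(_tmp) reproduces the negative-value rounding threshold
+// of the per-tensor branch, and the final select keeps the unshifted value when _dst_shift < 0
+// (the shift was already applied as a pre-multiplication).
+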
+/** Quantize the 8-bit tile with a fixed-point scale for asymmetric quantization
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset used for both the per-tensor and per-channel quantization
+ * @param[in] DST_SHIFT Quantization shift for the per-tensor quantization
+ * @param[in] DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
+ * @param[in] src Input tile
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
+ _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
+ SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
+ long a_64 = (long)(_src); \
+ long b_64 = (long)(DST_MULTIPLIER); \
+ long ab_64 = a_64 * b_64; \
+ long mask1 = 1 << 30; \
+ long mask2 = 1 - (1 << 30); \
+ long is_positive_or_zero = ab_64 >= 0; \
+ long nudge = select(mask2, mask1, is_positive_or_zero); \
+ SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
+ _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
+ if(DST_SHIFT >= 0) \
+ { \
+ long mask = ((((int)1) << DST_SHIFT) - (int)1); \
+ long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
+ _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
+ } \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ }) \
+ }) \
+ })
+
+/** Conditional rowset (memset by row)
+ *
+ * @note Set the row to VALUE_TO_SET if the corresponding mask == 0
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] VALUE_TO_SET Value to set the row
+ * @param[in, out] a Input/output tile
+ * @param[in] mask Mask checked to decide whether to set the row to VALUE_TO_SET
+ */
+#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, 1, N0, \
+ { \
+ a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
+ }) \
+ }) \
+ })
+
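+// Editor's sketch (illustrative; acc and my_mask are placeholder names): zero the rows of a
+// tile whose mask element is 0, e.g. rows that map to out-of-bound output positions:
+//
+//   T_ROWSET_MASK(float, M0, N0, 0, acc, my_mask);
+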
+/** Element-wise activation for floating point types
+ *
+ * @note Performs: activation(SRC) = DST
+ *
+ * @param[in] DATA_TYPE SRC/DST data type
+ * @param[in] M0 Number of SRC/DST rows
+ * @param[in] N0 Number of SRC/DST columns
+ * @param[in] ACTIVATION_TYPE Activation type
+ * @param[in] A_VAL A value used for the activation (e.g. tanh_op, brelu, ...)
+ * @param[in] B_VAL B value used for the activation (e.g. tanh_op, brelu, ...)
+ * @param[in] src SRC tile
+ * @param[out] dst DST tile
+ */
+#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
+ }) \
+ })
+
+// NOTE: A_VAL and B_VAL should be quantized values (using the same quantization info as x)
+// RELU Activation
+#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_POINT, x))
+// Bounded RELU Activation
+#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_POINT, x)))
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+// Hard Swish Activation
+#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
+// Identity Activation
+#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) (x)
+
+#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x)
+#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_POINT, A_VAL, B_VAL, x)
+
+#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
+#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
+#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
+#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
+
+/** Element-wise activation for quantized types
+ *
+ * @note Performs: activation(SRC) = DST
+ *
+ * @param[in] DATA_TYPE SRC/DST data type
+ * @param[in] M0 Number of SRC/DST rows
+ * @param[in] N0 Number of SRC/DST columns
+ * @param[in] ACTIVATION_TYPE Activation type
+ * @param[in] ZERO_POINT The zero value to consider in the computation
+ * @param[in] A_VAL Quantized A value used for the activation (e.g. tanh_op, brelu, ...)
+ * @param[in] B_VAL Quantized B value used for the activation (e.g. tanh_op, brelu, ...)
+ * @param[in] src SRC tile
+ * @param[out] dst DST tile
+ */
+#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_POINT, A_VAL, B_VAL, src[_m0].v); \
+ }) \
+ })
+
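+// Editor's sketch (illustrative): the lower-case op token is pasted into <op>_op_quantized
+// above, so a bounded ReLU on a uchar tile reads:
+//
+//   T_ACTIVATION_QUANTIZED(uchar, M0, N0, brelu, ZERO_POINT, A_VAL, B_VAL, c, c);
+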
+/** Element-wise addition between two tiles
+ *
+ * @note Performs: LHS + RHS = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
+ }) \
+ })
+
+/** Element-wise addition with a constant value
+ *
+ * @note Performs: LHS + constant = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs_constant Constant value
+ * @param[out] dst DST tile
+ */
+#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
+ }) \
+ })
+
+#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise scale with a constant value
+ *
+ * @note Performs: LHS * constant = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs_constant Constant value
+ * @param[out] dst DST tile
+ */
+#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
+ }) \
+ })
+
+/** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
+ *
+ * @note Performs: LHS OP RHS[broadcasted] = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] T_ELWISE_OP Elementwise operator to perform
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
+/** Element-wise operation with LHS broadcasted (LHS has the X dimension only)
+ *
+ * @note Performs: LHS[broadcasted] OP RHS = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] T_ELWISE_OP Elementwise operator to perform
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of RHS rows
+ * @param[in] N0 Number of RHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
+#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise operation between two tiles (LHS and RHS)
+ *
+ * @note Performs: LHS OP RHS = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] T_ELWISE_OP Elementwise operator to perform
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
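+// Editor's note: the convenience wrappers above expand into T_ELTWISE with the matching V_*
+// operator, e.g.:
+//
+//   T_ELTWISE_ADD(float, M0, N0, lhs, rhs, dst); // dst[m].v = lhs[m].v + rhs[m].v
+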
+/** Floor operation on a tile
+ *
+ * @note Performs: floor(SRC) = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of SRC rows
+ * @param[in] N0 Number of SRC columns
+ * @param[in] src SRC tile
+ * @param[out] dst DST tile
+ */
+#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+ { \
+ dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
+ }) \
+ })
+
+/** Matrix multiplication
+ *
+ * @note Performs: LHS X RHS + DST = DST
+ *
+ * @param[in] LHS_DATA_TYPE LHS tile data type
+ * @param[in] RHS_DATA_TYPE RHS tile data type
+ * @param[in] DST_DATA_TYPE DST tile data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of RHS columns
+ * @param[in] K0 Number of LHS columns
+ * @param[in] LHS_LAYOUT LHS layout (T= transposed, NT= not transposed)
+ * @param[in] RHS_LAYOUT RHS layout (T= transposed, NT= not transposed)
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[in, out] dst DST tile
+ *
+ * @note For Int8/UInt8 multiplications, only T_MMUL_NT_T is provided because we need
+ * to multiply the rows of the Lhs and Rhs tensors to utilize the dot product extension.
+ * Supporting other layouts would require on-the-fly transposition of the tile
+ * elements and is therefore not favored.
+ */
+#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
+ }) \
+ }) \
+ }) \
+ }
+
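+// Editor's sketch (illustrative; M0/N0/K0 are compile-time tile sizes): with a non-transposed
+// LHS and a transposed RHS (N0 rows of K0 elements), dst accumulates in place and must be
+// initialised (e.g. zeroed) beforehand:
+//
+//   TILE(float, M0, K0, a);
+//   TILE(float, N0, K0, b);
+//   TILE(float, M0, N0, c);
+//   // ... zero-initialise c, fill a and b ...
+//   T_MMUL(float, float, float, M0, N0, K0, NT, T, a, b, c);
+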
+#define T_MMUL_NT_NT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].v = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (rhs[_k].v), dst[_m].v); \
+ }) \
+ }) \
+ }
+
+#define T_MMUL_T_NT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_k].s[_m]), (DST_DATA_TYPE)(rhs[_k].s[_n]), dst[_m].s[_n]); \
+ }) \
+ }) \
+ }) \
+ }
+
+#define T_MMUL_T_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_T_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ LOOP_UNROLLING(int, _k, 0, 1, K0, \
+ { \
+ dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_k].s[_m]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
+ }) \
+ }) \
+ }) \
+ }
+
+#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m, 0, 1, M0, \
+ { \
+ LOOP_UNROLLING(int, _n, 0, 1, N0, \
+ { \
+ DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
+ }) \
+ }) \
+ })
+
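+// Editor's note: DOT_PRODUCT_INTEGER8 (defined earlier in this header) reduces K0 8-bit values
+// per call and can map to the Arm dot-product OpenCL extension where available, which is why
+// only the NT/T layout is provided for 8-bit types (see the @note on T_MMUL above).
+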
+#endif /* ACL_SRC_CORE_CL_CL_KERNELS_TILE_HELPERS */
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
deleted file mode 100644
index 785be6c710..0000000000
--- a/src/core/CL/cl_kernels/transpose.cl
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define SWAP_ROW(u0, l0) \
- ({ \
- tmp_swap = u0; \
- u0 = l0; \
- l0 = tmp_swap; \
- })
-
-#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- tmp_swap; \
- SWAP_ROW(u0, l0); \
- SWAP_ROW(u1, l1); \
- SWAP_ROW(u2, l2); \
- SWAP_ROW(u3, l3); \
- })
-
-#define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 8) \
- tmp_swap; \
- SWAP_ROW(u0, l0); \
- SWAP_ROW(u1, l1); \
- SWAP_ROW(u2, l2); \
- SWAP_ROW(u3, l3); \
- SWAP_ROW(u4, l4); \
- SWAP_ROW(u5, l5); \
- SWAP_ROW(u6, l6); \
- SWAP_ROW(u7, l7); \
- })
-
-#define TRANSPOSE_4x4(u0, u1, u2, u3) \
- ({ \
- VEC_DATA_TYPE(DATA_TYPE, 4) \
- tmp; \
- tmp.s012 = u0.s123; \
- u0.s1 = u1.s0; \
- u0.s2 = u2.s0; \
- u0.s3 = u3.s0; \
- u1.s0 = tmp.s0; \
- u2.s0 = tmp.s1; \
- u3.s0 = tmp.s2; \
- \
- tmp.s01 = u1.s23; \
- u1.s2 = u2.s1; \
- u1.s3 = u3.s1; \
- u2.s1 = tmp.s0; \
- u3.s1 = tmp.s1; \
- \
- tmp.s0 = u2.s3; \
- u2.s3 = u3.s2; \
- u3.s2 = tmp.s0; \
- })
-
-#define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) \
- ({ \
- TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123); \
- TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567); \
- TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
- TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567); \
- SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
- })
-
-#define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) \
- ({ \
- TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567); \
- TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF); \
- TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
- TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \
- SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF, \
- u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
- })
-
-#ifndef DATA_TYPE_IN_BYTES
-#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
-#endif /* not DATA_TYPE_IN_BYTES */
-
-#undef VLOAD
-#undef VSTORE
-
-#if DATA_TYPE_IN_BYTES == 4
-#define DATA_TYPE uint
-#define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3)
-#define VLOAD(x, y) vload4(x, y)
-#define VSTORE(x, y, z) vstore4(x, y, z)
-#define BLOCK_SIZE 4
-#elif DATA_TYPE_IN_BYTES == 2
-#define DATA_TYPE ushort
-#define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7)
-#define VLOAD(x, y) vload8(x, y)
-#define VSTORE(x, y, z) vstore8(x, y, z)
-#define BLOCK_SIZE 8
-#elif DATA_TYPE_IN_BYTES == 1
-#define DATA_TYPE uchar
-#define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15)
-#define VLOAD(x, y) vload16(x, y)
-#define VSTORE(x, y, z) vstore16(x, y, z)
-#define BLOCK_SIZE 16
-#else /* switch DATA_TYPE_IN_BYTES */
-#error DATA_TYPE_IN_BYTES not supported for transpose
-#endif /* switch DATA_TYPE_IN_BYTES */
-
-/** This OpenCL kernel computes the matrix transposition of input matrix
- *
- * @attention The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
- * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
- * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
- * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void transpose(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- uint x = get_global_id(0) * BLOCK_SIZE;
- uint y = get_global_id(1) * BLOCK_SIZE;
-
- // Compute source address
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- // Load the NxN block at (x, y)
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
-#if BLOCK_SIZE > 4
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 5)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7)));
-#if BLOCK_SIZE == 16
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14)));
- VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
- u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15)));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
-
- // Transpose the block
- TRANSPOSE();
-
- // Store the block at (y, x)
- uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes;
- VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y));
- VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y));
- VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y));
- VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y));
-#if BLOCK_SIZE > 4
- VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y));
- VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y));
- VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y));
- VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 7 * dst_stride_y));
-#if BLOCK_SIZE == 16
- VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y));
- VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y));
- VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y));
- VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y));
- VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y));
- VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y));
- VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y));
- VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
-}
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
deleted file mode 100644
index 909b92055b..0000000000
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Returns a vector of floats contaning the matrix coefficients. */
-inline const float8 build_affine_mtx()
-{
- return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
-}
-
-/** Transforms 4 2D coordinates using the formula:
- *
- * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- *
- * @param[in] coord 2D coordinate to transform.
- * @param[in] mtx affine matrix
- *
- * @return a int8 containing 4 2D transformed values.
- */
-inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- // transform [x,x+1,x+2,x+3]
- const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
- // transform [y,y+1,y+2,y+3]
- const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-}
-
-/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8.
- *
- * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
- * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- * output(x,y) = input(x0,y0)
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] width Width of the destination image
- * @param[in] height Height of the destination image
- */
-__kernel void warp_affine_nearest_neighbour(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const int width,
- const int height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
-}
-
-/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] width Width of the destination image
- * @param[in] height Height of the destination image
- */
-__kernel void warp_affine_bilinear(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const int width,
- const int height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 64828259d0..6595bd1981 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016, 2017, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,14 +31,15 @@
* @param[in] border_size Border size of the image
*
*/
-inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+inline const float8
+clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
{
const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3,
+ clamped_y.s3);
}
-/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
/** Clamps the given coordinates to the borders.
*
* @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
@@ -64,12 +65,6 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int
*((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
}
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords()
-{
- return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
/** Given a texel coordinates this function will return the following array of coordinates:
* [ P, right neighbour, below neighbour, below right neighbour ]
*
@@ -81,7 +76,8 @@ inline const float2 get_current_coords()
*/
inline const float8 get_neighbour_coords(const float2 coord)
{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1,
+ /*br*/ coord.s0 + 1, coord.s1 + 1);
}
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
@@ -92,41 +88,41 @@ inline const float8 get_neighbour_coords(const float2 coord)
* @param[in] height Height of the image
* @param[in] border_size Border size
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(
+ const Image *in, const float8 coords, const float width, const float height, const float border_size)
{
// If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
// Sets the 4x4 coordinates for each of the four input texels
const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+ const float16 c1 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+ const float16 c2 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
// Loads the values from the input image
const float16 t = (float16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
- ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
- ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
- ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ /* tl, tr, bl, br */
+ *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr =
+ (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
}
-/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
*
* @param[in] in Pointer to the source image.
@@ -134,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const
* @param[in] width Width of the image
* @param[in] height Height of the image
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4)
+ bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
{
return bilinear_interpolate_with_border(in, coords, width, height, 1);
}
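The vectorized float16 swizzles above can be hard to follow, so a plain-C reference of one bilinear sample with the same weighting may help. This is a sketch, not library code: read_pixel() is a hypothetical accessor standing in for the clamped offset() load, and border handling is collapsed into a simple replicate clamp.

#include <math.h>

/* Hypothetical clamped accessor standing in for offset(). */
static float read_pixel(const float *img, int w, int h, int x, int y)
{
    x = x < 0 ? 0 : (x >= w ? w - 1 : x);
    y = y < 0 ? 0 : (y >= h ? h - 1 : y);
    return img[y * w + x];
}

/* One bilinear sample with the same weighting as the vector math above:
 * a = coords - floor(coords), b = 1 - a, out = tl*bx*by + tr*ax*by + ... */
static float bilinear_sample(const float *img, int w, int h, float cx, float cy)
{
    const int   x0 = (int)floorf(cx), y0 = (int)floorf(cy);
    const float ax = cx - (float)x0, ay = cy - (float)y0;
    const float bx = 1.0f - ax, by = 1.0f - ay;
    const float tl = read_pixel(img, w, h, x0, y0);
    const float tr = read_pixel(img, w, h, x0 + 1, y0);
    const float bl = read_pixel(img, w, h, x0, y0 + 1);
    const float br = read_pixel(img, w, h, x0 + 1, y0 + 1);
    return tl * bx * by + tr * ax * by + bl * bx * ay + br * ax * ay;
}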
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
deleted file mode 100644
index ca21be6765..0000000000
--- a/src/core/CL/cl_kernels/warp_helpers_quantized.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-/** Clamps the given coordinates to the borders according to the border size.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] border_size Border size of the image
- *
- */
-inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
-{
- const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
- const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
-}
-
-/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
-/** Clamps the given coordinates to the borders.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- *
- */
-inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
-{
- return clamp_to_border_with_size_quantized(coords, width, height, 1);
-}
-
-/** Given a texel's coordinates, this function returns the following array of coordinates:
- * [ P, right neighbour, below neighbour, below right neighbour ]
- *
- * @note No check is done here on whether the coordinates are out of the image.
- *
- * @param[in] coord Input coordinates
- *
- * @return Vector of 8 floats with the coordinates; even positions are x and odd positions are y.
- */
-inline const float8 get_neighbour_coords_quantized(const float2 coord)
-{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
-}
-
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords_quantized()
-{
- return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even positions are x and odd positions are y.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] border_size Border size
- * @param[in] scale Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
- const float scale, const int offset_qasymm)
-{
- // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
-
- // Sets the 4x4 coordinates for each of the four input texels
- const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
-
- // Loads the values from the input image
- const int16 t = (int16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
-
- const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
-
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
- ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
- ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
- ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
-
- const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));
-
- return res;
-}
-
-/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even positions are x and odd positions are y.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] scale Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
-{
- return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
-}
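The deleted quantized helper wraps the same interpolation in a dequantize/requantize pair: values are shifted by offset_qasymm and scaled to float, interpolated, then divided back by the scale and saturated with round-towards-positive-infinity. A minimal scalar sketch of that wrapping, assuming QASYMM8 (uint8) values and hypothetical parameter names; q_tl..q_br are the four already-loaded texels.

#include <math.h>
#include <stdint.h>

/* Dequantize -> interpolate -> requantize; the ceilf() matches the
 * convert_int4_sat_rtp (round towards positive infinity) conversion
 * used by the deleted kernel. Sketch only. */
static uint8_t bilinear_sample_qasymm8(uint8_t q_tl, uint8_t q_tr,
                                       uint8_t q_bl, uint8_t q_br,
                                       float ax, float ay,
                                       float scale, int offset)
{
    const float tl = (float)((int)q_tl - offset) * scale;
    const float tr = (float)((int)q_tr - offset) * scale;
    const float bl = (float)((int)q_bl - offset) * scale;
    const float br = (float)((int)q_br - offset) * scale;
    const float bx = 1.0f - ax, by = 1.0f - ay;
    const float f  = tl * bx * by + tr * ax * by + bl * bx * ay + br * ax * ay;
    int q = (int)ceilf(f / scale) + offset;
    q = q < 0 ? 0 : (q > 255 ? 255 : q); /* saturate to the U8 range */
    return (uint8_t)q;
}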
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
deleted file mode 100644
index bed78388a4..0000000000
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-/** Returns the perspective matrix */
-inline const float16 build_perspective_mtx()
-{
- return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
-}
-
-/** Transforms four 2D coordinates using the formula:
- *
- * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
- *
- * and returns the point (x0/z0, y0/z0).
- *
- * @param[in] coord 2D coordinate to transform.
- * @param[in] mtx perspective matrix
- *
- * @return a vector float8 containing four 2D transformed values.
- */
-inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- // transform [z,z+1,z+2,z+3]
- const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
- // NOTE: Do not multiply x and y by 1.f/Z as this will result in loss of accuracy and mismatches with the VX reference implementation
- // transform [x,x+1,x+2,x+3]
- const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
- // transform [y,y+1,y+2,y+3]
- const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-}
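Read off the mad() calls, apply_perspective_transform() treats MAT0..MAT2 as the first column of the 3x3 matrix. The kernel processes four x-adjacent points at once; a scalar sketch of the same math for a single point, with m[] a hypothetical array holding MAT0..MAT8 in that layout:

/* Scalar form of apply_perspective_transform() for one point. Sketch only. */
static void perspective_point(const float m[9], float x, float y,
                              float *out_x, float *out_y)
{
    const float z = x * m[2] + y * m[5] + m[8];
    /* Divide by z directly; premultiplying by 1/z loses accuracy
     * (see the NOTE in the deleted kernel). */
    *out_x = (x * m[0] + y * m[3] + m[6]) / z;
    *out_y = (x * m[1] + y * m[4] + m[7]) / z;
}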
-
-/** Performs perspective transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
- *
- * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
- * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
- * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
- * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
- *
- * output(x,y) = input(x0/z0,y0/z0)
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] width Width of the destination image
- * @param[in] height Height of the destination image
- */
-__kernel void warp_perspective_nearest_neighbour(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const int width,
- const int height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
-}
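As the @attention blocks note, the matrix never reaches the kernel as an argument; it is baked in as -DMATn build options. A host-side sketch of that step, assuming 'program' is an already-created cl_program (only the standard clBuildProgram API is used):

#include <stdio.h>
#include <CL/cl.h>

/* Bake the 3x3 matrix into the program as -DMATn defines; the trailing
 * 'f' keeps the constants single precision in OpenCL C. Sketch only. */
static cl_int build_warp_program(cl_program program, const float m[9])
{
    char options[512];
    snprintf(options, sizeof(options),
             "-DMAT0=%ff -DMAT1=%ff -DMAT2=%ff -DMAT3=%ff -DMAT4=%ff "
             "-DMAT5=%ff -DMAT6=%ff -DMAT7=%ff -DMAT8=%ff",
             m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8]);
    return clBuildProgram(program, 0, NULL, options, NULL, NULL);
}

Compiling one program per matrix trades build time for constant-folded arithmetic inside the kernel.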
-
-/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
- *
- * @attention The matrix coefficients need to be passed at compile time:\n
- * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
- * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] width Width of the destination image
- * @param[in] height Height of the destination image
- */
-__kernel void warp_perspective_bilinear(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const int width,
- const int height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
deleted file mode 100644
index 5e5b737785..0000000000
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ /dev/null
@@ -1,2753 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s0) && (z_cond))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s1) && (z_cond))); \
- })
-
-#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s0))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s1))); \
- })
-
-#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s4) && (z_cond))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s5) && (z_cond))); \
- basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s6) && (z_cond))); \
- basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s7) && (z_cond))); \
- })
-
-#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s4))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s5))); \
- basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s6))); \
- basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s7))); \
- })
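Each line these macros expand to is a guarded keep-or-zero on one loaded value. A scalar equivalent of a single lane, as a sketch:

/* One lane of FILL_ZERO_OUT_OF_BOUND_*: keep the value only when both
 * the per-lane condition and the shared condition hold, else zero. */
static float fill_zero_out_of_bound_lane(float v, int lane_cond, int shared_cond)
{
    return (lane_cond && shared_cond) ? v : 0.0f;
}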
-
-#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
- ({ \
- comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
- comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
- comm_fact.s2 = 2.5f * tmp.s3; \
- comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
- comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
- comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
- comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
- \
- out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
- out.s1 = comm_fact.s0 + comm_fact.s1; \
- out.s2 = comm_fact.s0 - comm_fact.s1; \
- out.s3 = comm_fact.s3 + comm_fact.s4; \
- out.s4 = comm_fact.s4 - comm_fact.s3; \
- out.s5 = comm_fact.s5 + comm_fact.s6; \
- out.s6 = comm_fact.s5 - comm_fact.s6; \
- out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
- })
-
-#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
- ({ \
- comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
- comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
- comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
- comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
- comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
- comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
- out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
- out.s1 = comm_fact.s0 - comm_fact.s1; \
- out.s2 = comm_fact.s0 + comm_fact.s1; \
- out.s3 = comm_fact.s2 - comm_fact.s3; \
- out.s4 = comm_fact.s2 + comm_fact.s3; \
- out.s5 = comm_fact.s4 - comm_fact.s5; \
- out.s6 = comm_fact.s4 + comm_fact.s5; \
- out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
- })
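Both OUTPUT_ROW macros are factored row-passes of a Winograd B^T matrix. Collecting the coefficients of OUTPUT_ROW_4x4_5x5 back into dense form (by expanding the comm_fact terms above) gives the 8-point transform below; OUTPUT_ROW_2x2_7x7 follows the same pattern with its own coefficients. Unoptimized reference sketch:

/* Dense form of OUTPUT_ROW_4x4_5x5: out = BT8 * tmp. */
static const float BT8[8][8] = {
    { 1.f,  0.f,  -5.25f,  0.f,    5.25f,  0.f,   -1.f, 0.f },
    { 0.f,  1.f,   1.f,   -4.25f, -4.25f,  1.f,    1.f, 0.f },
    { 0.f, -1.f,   1.f,    4.25f, -4.25f, -1.f,    1.f, 0.f },
    { 0.f,  0.5f,  0.25f, -2.5f,  -1.25f,  2.f,    1.f, 0.f },
    { 0.f, -0.5f,  0.25f,  2.5f,  -1.25f, -2.f,    1.f, 0.f },
    { 0.f,  2.f,   4.f,   -2.5f,  -5.f,    0.5f,   1.f, 0.f },
    { 0.f, -2.f,   4.f,    2.5f,  -5.f,   -0.5f,   1.f, 0.f },
    { 0.f, -1.f,   0.f,    5.25f,  0.f,   -5.25f,  0.f, 1.f },
};

static void output_row_4x4_5x5_ref(float out[8], const float tmp[8])
{
    for (int r = 0; r < 8; ++r)
    {
        out[r] = 0.f;
        for (int c = 0; c < 8; ++c)
            out[r] += BT8[r][c] * tmp[c];
    }
}

Factoring out the comm_fact terms saves roughly half the multiplies relative to this dense form, which is the point of the macro.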
-
-#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = get_global_id(2) % SRC_DEPTH;
- const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp0 = in_row0;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- tmp0 -= in_row2;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
- DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
- DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
- DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp1 = in_row1 + in_row2;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp2 = in_row2 - in_row1;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp3 = in_row1 - in_row3;
-
- DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
- DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
- DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
- DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
-
- DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
- DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
- DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
- DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
-
- DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
- DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
- DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
- DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
- *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
- *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
- *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
- *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
- *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
- *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
- *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
- *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
- *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
- *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
- *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
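The unrolled out00..out33 expressions in this kernel are the two-sided product B^T * d * B on a 4x4 input tile: tmp0..tmp3 are the row pass and the out*0..out*3 terms the column pass. A naive reference sketch of the same transform:

/* 2x2/3x3 Winograd input transform reference: out = BT * d * BT^T.
 * Not a drop-in replacement for the kernel; for checking only. */
static void winograd_input_2x2_3x3_ref(float out[4][4], const float d[4][4])
{
    static const float BT[4][4] = {
        { 1.f,  0.f, -1.f,  0.f },
        { 0.f,  1.f,  1.f,  0.f },
        { 0.f, -1.f,  1.f,  0.f },
        { 0.f,  1.f,  0.f, -1.f },
    };
    float tmp[4][4]; /* tmp = BT * d (the tmp0..tmp3 rows above) */
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
        {
            tmp[r][c] = 0.f;
            for (int k = 0; k < 4; ++k)
                tmp[r][c] += BT[r][k] * d[k][c];
        }
    for (int r = 0; r < 4; ++r) /* out = tmp * B, with B = BT^T */
        for (int c = 0; c < 4; ++c)
        {
            out[r][c] = 0.f;
            for (int k = 0; k < 4; ++k)
                out[r][c] += tmp[r][k] * BT[c][k];
        }
}

The kernel unrolls this product completely and skips the zero entries, which is why no loops or multiplies appear in the code above.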
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = (get_global_id(2) * 2) % SRC_DEPTH;
- const int b = (get_global_id(2) * 2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2) * 2;
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- src_addr += src_stride_z;
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp0 = in_row0;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp4 = in_row4;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- tmp0 -= in_row2;
- tmp4 -= in_row6;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp1 = in_row1 + in_row2;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp2 = in_row2 - in_row1;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp3 = in_row1 - in_row3;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp5 = in_row5 + in_row6;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp6 = in_row6 - in_row5;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- tmp7 = in_row5 - in_row7;
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
- vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
- vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
- vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
- vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
- vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
- vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
- vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
- vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
- vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
- vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
- vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
- vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
- vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
- vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
- vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
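The stepz2 variant differs from stepz1 only in its z mapping: each work-item covers two consecutive channels and stores them with a single vstore2, which is why the channel count must be a multiple of 2. The index math, as a sketch:

/* z mapping of the stepz2 variant (SRC_DEPTH case): work-item gid2 owns
 * channels z and z+1 of batch b. Sketch of the mapping only. */
static void stepz2_indices(int gid2, int src_depth, int *z, int *b)
{
    *z = (gid2 * 2) % src_depth;
    *b = (gid2 * 2) / src_depth;
}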
-
-/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size is 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = get_global_id(2) % SRC_DEPTH;
- const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- // Row0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
-#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- // Row0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE out0 = 0.0f;
- DATA_TYPE out1 = 0.0f;
- DATA_TYPE out2 = 0.0f;
- DATA_TYPE out3 = 0.0f;
- DATA_TYPE out4 = 0.0f;
- DATA_TYPE out5 = 0.0f;
-
- // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
- out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
- out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
- out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
- out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
- out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
- out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- // Row4
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-
- // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
- DATA_TYPE k0 = d41.s0;
- DATA_TYPE k1 = d41.s0;
- DATA_TYPE k2 = d41.s0;
- DATA_TYPE k3 = d41.s0;
- DATA_TYPE k4 = d41.s0;
- DATA_TYPE k5 = 0.0f;
-
- k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
- k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
- k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
- k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
- k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
- k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
-
- out0 += k0;
- out1 += k1;
- out2 += k2;
- out3 += k3;
- out4 += k4;
- out5 += k5;
-
- // Row2
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-
- out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
- out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
- out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
- out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
- out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
- out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
-#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Compute destination address
-#if defined(SRC_DEPTH)
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
-#else /* defined(SRC_DEPTH) */
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
-#endif /* defined(SRC_DEPTH) */
-
- uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
-
- *(dst_addr) = out0;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out1;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out2;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out3;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out4;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out5;
- dst_addr += dst_plane_stride;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE out6 = k0;
- DATA_TYPE out7 = k1;
- DATA_TYPE out8 = k2;
- DATA_TYPE out9 = k3;
- DATA_TYPE out10 = k4;
- DATA_TYPE out11 = k5;
- DATA_TYPE out12 = k0;
- DATA_TYPE out13 = k1;
- DATA_TYPE out14 = k2;
- DATA_TYPE out15 = k3;
- DATA_TYPE out16 = k4;
- DATA_TYPE out17 = k5;
- DATA_TYPE out18 = k0;
- DATA_TYPE out19 = k1;
- DATA_TYPE out20 = k2;
- DATA_TYPE out21 = k3;
- DATA_TYPE out22 = k4;
- DATA_TYPE out23 = k5;
- DATA_TYPE out24 = k0;
- DATA_TYPE out25 = k1;
- DATA_TYPE out26 = k2;
- DATA_TYPE out27 = k3;
- DATA_TYPE out28 = k4;
- DATA_TYPE out29 = k5;
-
- // Row1
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-
- // Row3
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-
- // Compute common parts for the channels between [6, 29]
- // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
- // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
- DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
- DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
- DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
- DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
- DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
- DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
- DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
- DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
- DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
- DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
- DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
- DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
-
- // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
- // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
- DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
- DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
- DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
- DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
- DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
- DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
- DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
- DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
- DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
- DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
- DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
- DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
-
- out6 += part0 - part1;
- out12 += part0 + part1;
- out7 += part2 + part3 + part4 + part5;
- out8 += part2 - part3 + part4 - part5;
- out13 += part2 + part3 - part4 - part5;
- out14 += part2 - part3 - part4 + part5;
- out9 += part6 + part7 + part8 + part9;
- out10 += part6 - part7 + part8 - part9;
- out15 += part6 - part7 - part8 + part9;
- out16 += part6 + part7 - part8 - part9;
- out11 += part10 + part11;
- out17 += part10 - part11;
-
- out18 += part13 - part12;
- out24 += part13 + part12;
- out19 += part14 + part15 + part16 + part17;
- out20 += part14 - part15 + part16 - part17;
- out25 += part14 - part15 - part16 + part17;
- out26 += part14 + part15 - part16 - part17;
- out21 += part18 + part19 + part20 + part21;
- out22 += part18 - part19 + part20 - part21;
- out27 += part18 - part19 - part20 + part21;
- out28 += part18 + part19 - part20 - part21;
- out23 += part22 + part23;
- out29 += part22 - part23;
-
- *(dst_addr) = out6;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out7;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out8;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out9;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out10;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out11;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out12;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out13;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out14;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out15;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out16;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out17;
- dst_addr += dst_plane_stride;
-
- *(dst_addr) = out18;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out19;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out20;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out21;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out22;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out23;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out24;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out25;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out26;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out27;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out28;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out29;
- dst_addr += dst_plane_stride;
-
- // Row5
- VEC_DATA_TYPE(DATA_TYPE, 4)
- d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 2)
- d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
-
- // Channels [30, 35]
- out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
- out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
-
- *(dst_addr) = out0;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out1;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out2;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out3;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out4;
- dst_addr += dst_plane_stride;
- *(dst_addr) = out5;
- dst_addr += dst_plane_stride;
-#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
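As with the 2x2/3x3 case, the out0..out5 accumulations here are a fused two-sided B^T * d * B: the coefficients 16, -20, 4 and so on are products of entries of the 6x6 B^T (e.g. 16 = 4*4 and -20 = 4*(-5), row 0 applied on both sides). Recovered in dense form, as a sketch; it is applied with the same two-sided product as the 2x2/3x3 reference earlier:

/* 6x6 B^T for the 4x4/3x3 input transform, read off the accumulations
 * above. Sketch for checking the fused kernel, not production code. */
static const float BT6[6][6] = {
    { 4.f,  0.f, -5.f,  0.f, 1.f, 0.f },
    { 0.f, -4.f, -4.f,  1.f, 1.f, 0.f },
    { 0.f,  4.f, -4.f, -1.f, 1.f, 0.f },
    { 0.f, -2.f, -1.f,  2.f, 1.f, 0.f },
    { 0.f,  2.f, -1.f, -2.f, 1.f, 0.f },
    { 0.f,  4.f,  0.f, -5.f, 0.f, 1.f },
};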
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
- const int z = get_global_id(2) % SRC_DEPTH;
- const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute input address
-#if defined(SRC_DEPTH)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
- // Load input tile
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
- const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Calculate common factors for intermediate tensor
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp0 = in_row0;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = 0.0f;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25 * in_row4;
- tmp0 += -in_row6 + (DATA_TYPE)5.25 * in_row4 - (DATA_TYPE)5.25 * in_row2;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25 * in_row3;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact2 = (DATA_TYPE)0.25 * in_row2 - (DATA_TYPE)1.25 * in_row4 + in_row6;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
-
- comm_fact0 = (DATA_TYPE)2.5 * in_row3;
- comm_fact1 = (DATA_TYPE)0.5 * in_row1 - comm_fact0 + (DATA_TYPE)2.0 * in_row5;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
-
- comm_fact1 = (DATA_TYPE)2.0 * in_row1 - comm_fact0 + (DATA_TYPE)0.5 * in_row5;
- comm_fact2 = (DATA_TYPE)4.0 * in_row2 - (DATA_TYPE)5.0 * in_row4 + in_row6;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25 * in_row3 - (DATA_TYPE)5.25 * in_row5;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Calculate output rows (reuse comm_fact0 vector)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0;
-
- OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1, out2, out3, out4, out5, out6, out7;
-
- OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Store values across the channels
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
- *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
- *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
- *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
- *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
- *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
- *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
- *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
- *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
- *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
- *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
- *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
- *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
- *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
- *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
- *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
- *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
- *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
- *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
- *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
- *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
- *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
- *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
- *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
- *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
- *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
- *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
- *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
- *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
- *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
- *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
- *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
- *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
- *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
- *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
- *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
- *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
- *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
- *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
- *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
- *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
- *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
- *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
- *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
- *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
- *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
- *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
- *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
- *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
- *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
- *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
- *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
- *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
- *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
- *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
- *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
- *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
- *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
- *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
- *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
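These kernels are specialized entirely through the -D build options documented in the headers above. Below is a minimal host-side sketch in C of how such an options string might be assembled; the helper name and the concrete values are illustrative assumptions, not part of this file or of the library's public API:

    #include <CL/cl.h>
    #include <stdio.h>

    /* Hypothetical helper: compile a program holding the NCHW 4x4/5x5 input
     * transform for fp32, 5 tiles across, 2-pixel left/top padding and a
     * 64-channel input. The -D names are the ones the kernel headers document. */
    static cl_int build_input_transform(cl_program program, cl_device_id device)
    {
        char options[256];
        snprintf(options, sizeof(options),
                 "-DDATA_TYPE=float -DNUM_TILES_X=%d -DPAD_LEFT=%d -DPAD_TOP=%d "
                 "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DSRC_DEPTH=%d",
                 5, 2, 2, 64);
        return clBuildProgram(program, 1, &device, options, NULL, NULL);
    }

Leaving out -DSRC_DEPTH selects the non-batched addressing path, as the #if defined(SRC_DEPTH) branches in the kernel above show.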
-
-#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the kernel size is 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float). Supported data types: float/half.
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // Index channel
- const int x = get_global_id(0);
- // Index width
- const int y = get_global_id(1);
-#if defined(NUM_TILES_Y)
- // Index height
- const int z = get_global_id(2) % NUM_TILES_Y;
-    // Index batch
- const int b = get_global_id(2) / NUM_TILES_Y;
-#else // defined(NUM_TILES_Y)
- // Index height
- const int z = get_global_id(2);
-#endif // defined(NUM_TILES_Y)
-
-#if defined(NUM_TILES_Y)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
-#else // defined(NUM_TILES_Y)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
-#endif // defined(NUM_TILES_Y)
-
- // Origin coordinates for the width (y) and height (z) in the input tensor
- int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
- int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
- int4 z_coord0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
- int2 z_coord1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
-
- // Coordinates to use to avoid out-of-bound reads
- int4 y_coord_valid0 = clamp(y_coord0, (int4)0, (int4)((int)SRC_DIM_1 - 1));
- int2 y_coord_valid1 = clamp(y_coord1, (int2)0, (int2)((int)SRC_DIM_1 - 1));
- int4 z_coord_valid0 = clamp(z_coord0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
- int2 z_coord_valid1 = clamp(z_coord1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
-
- // Boundary conditions
- int4 y_cond0 = y_coord_valid0 == y_coord0;
- int2 y_cond1 = y_coord_valid1 == y_coord1;
- int4 z_cond0 = z_coord_valid0 == z_coord0;
- int2 z_cond1 = z_coord_valid1 == z_coord1;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d4, y_cond, z_cond1.s0);
-
- DATA_TYPE k0 = d44;
- DATA_TYPE k1 = d44;
- DATA_TYPE k2 = d44;
- DATA_TYPE k3 = d44;
- DATA_TYPE k4 = d44;
- DATA_TYPE k5 = (DATA_TYPE)0.0f;
-
- k0 += 4.0f * d40 - 5.0f * d42;
- k1 += -4.0f * d41 - 4.0f * d42 + d43;
- k2 += 4.0f * d41 - 4.0f * d42 - d43;
- k3 += -2.0f * d41 + 2.0f * d43 - d42;
- k4 += 2.0f * d41 - 2.0f * d43 - d42;
- k5 += 4.0f * d41 - 5.0f * d43 + d45;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d0, y_cond, z_cond0.s0);
-
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(DATA_TYPE, d0, y_cond0.s0, z_cond);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
- DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
- DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
- DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
- DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
- DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d2, y_cond, z_cond0.s2);
-
- out0 += k0;
- out1 += k1;
- out2 += k2;
- out3 += k3;
- out4 += k4;
- out5 += k5;
- DATA_TYPE out6 = k0;
- DATA_TYPE out7 = k1;
- DATA_TYPE out8 = k2;
- DATA_TYPE out9 = k3;
- DATA_TYPE out10 = k4;
- DATA_TYPE out11 = k5;
- DATA_TYPE out12 = k0;
- DATA_TYPE out13 = k1;
- DATA_TYPE out14 = k2;
- DATA_TYPE out15 = k3;
- DATA_TYPE out16 = k4;
- DATA_TYPE out17 = k5;
- DATA_TYPE out18 = k0;
- DATA_TYPE out19 = k1;
- DATA_TYPE out20 = k2;
- DATA_TYPE out21 = k3;
- DATA_TYPE out22 = k4;
- DATA_TYPE out23 = k5;
- DATA_TYPE out24 = k0;
- DATA_TYPE out25 = k1;
- DATA_TYPE out26 = k2;
- DATA_TYPE out27 = k3;
- DATA_TYPE out28 = k4;
- DATA_TYPE out29 = k5;
-
- // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
- out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
- out1 += 20.0f * d21 + 20.0f * d22 - 5.0f * d23 - 5.0f * d24;
- out2 += -20.0f * d21 + 20.0f * d22 + 5.0f * d23 - 5.0f * d24;
- out3 += 10.0f * d21 + 5.0f * d22 - 10.0f * d23 - 5.0f * d24;
- out4 += -10.0f * d21 + 5.0f * d22 + 10.0f * d23 - 5.0f * d24;
- out5 += -20.0f * d21 + 25.0f * d23 - 5.0f * d25;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Compute destination address
-#if defined(NUM_TILES_Y)
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
-#else // defined(NUM_TILES_Y)
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
-#endif // defined(NUM_TILES_Y)
-
- uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
-
- *((__global DATA_TYPE *)dst_addr) = out0;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out1;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out2;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out3;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out4;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out5;
- dst_addr += dst_plane_stride;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
-
- DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d1, y_cond, z_cond0.s1);
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d3, y_cond, z_cond0.s3);
-
- // Compute common parts for the channels between [6, 29]
- // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
- // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
- DATA_TYPE part0 = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
- DATA_TYPE part1 = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
- DATA_TYPE part2 = 16.0f * d22 - 4.0f * d24;
- DATA_TYPE part3 = 16.0f * d21 - 4.0f * d23;
- DATA_TYPE part4 = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
- DATA_TYPE part5 = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
- DATA_TYPE part6 = 4.0f * d22 - 4.0f * d24;
- DATA_TYPE part7 = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
- DATA_TYPE part8 = 4.0f * d12 - 4.0f * d14 - d32 + d34;
- DATA_TYPE part9 = 8.0f * d21 - 8.0f * d23;
- DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
- DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
-
- // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
- // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
- DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
- DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
- DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d22 - d24
- DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
- DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
- DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d21 - d23
- DATA_TYPE part18 = part6 * 0.25f; // d22 - d24
- DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
- DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
- DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d21 - d23)
- DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d21 + 5.0f * d23 - d25
- DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
-
- out6 += part0 - part1;
- out12 += part0 + part1;
- out7 += part2 + part3 + part4 + part5;
- out8 += part2 - part3 + part4 - part5;
- out13 += part2 + part3 - part4 - part5;
- out14 += part2 - part3 - part4 + part5;
- out9 += part6 + part7 + part8 + part9;
- out10 += part6 - part7 + part8 - part9;
- out15 += part6 - part7 - part8 + part9;
- out16 += part6 + part7 - part8 - part9;
- out11 += part10 + part11;
- out17 += part10 - part11;
-
- out18 += part13 - part12;
- out24 += part13 + part12;
- out19 += part14 + part15 + part16 + part17;
- out20 += part14 - part15 + part16 - part17;
- out25 += part14 - part15 - part16 + part17;
- out26 += part14 + part15 - part16 - part17;
- out21 += part18 + part19 + part20 + part21;
- out22 += part18 - part19 + part20 - part21;
- out27 += part18 - part19 - part20 + part21;
- out28 += part18 + part19 - part20 - part21;
- out23 += part22 + part23;
- out29 += part22 - part23;
-
- *((__global DATA_TYPE *)dst_addr) = out6;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out7;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out8;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out9;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out10;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out11;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out12;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out13;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out14;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out15;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out16;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out17;
- dst_addr += dst_plane_stride;
-
- *((__global DATA_TYPE *)dst_addr) = out18;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out19;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out20;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out21;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out22;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out23;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out24;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out25;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out26;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out27;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out28;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out29;
- dst_addr += dst_plane_stride;
-
- // Row5
- DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
- DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
- DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
- DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
- DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
- DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d5, y_cond, z_cond1.s1);
-
- // Channels [30, 35]
- out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;
- out1 = -16.0f * d11 - 16.0f * d12 + 4.0f * d13 + 20.0f * d31 + 20.0f * d32 - 5.0f * d33 - 4.0f * d51 - 4.0f * d52 + d53 + d54 + 4.0f * d14 - 5.0f * d34;
- out2 = 16.0f * d11 - 16.0f * d12 - 4.0f * d13 - 20.0f * d31 + 20.0f * d32 + 5.0f * d33 + 4.0f * d51 - 4.0f * d52 - d53 + d54 + 4.0f * d14 - 5.0f * d34;
- out3 = -8.0f * d11 - 4.0f * d12 + 8.0f * d13 + 10.0f * d31 - 10.0f * d33 + 5.0f * d32 - 2.0f * d51 + 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
- out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
- out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;
-
- *((__global DATA_TYPE *)dst_addr) = out0;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out1;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out2;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out3;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out4;
- dst_addr += dst_plane_stride;
- *((__global DATA_TYPE *)dst_addr) = out5;
- dst_addr += dst_plane_stride;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
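Unlike the NCHW variants above, which offset by PAD_LEFT/PAD_TOP and rely on the tensor's padded borders, the NHWC kernels never read outside the tensor: every coordinate is clamped before the load, and the loaded value is then zeroed wherever the unclamped coordinate was out of range (the FILL_ZERO_OUT_OF_BOUND_*_NHWC_* macros, defined earlier in this file and not shown here). A scalar OpenCL C sketch of that pattern, under the assumption that this is what the macros implement:

    /* Sketch only: load one NHWC element with implicit zero padding. The read
     * goes through the clamped coordinate (always in bounds) and the result is
     * forced to zero when the original coordinate pointed into the padding. */
    inline DATA_TYPE load_zero_padded(__global const uchar *base, int y, int z,
                                      uint stride_y, uint stride_z)
    {
        const int y_valid = clamp(y, 0, (int)SRC_DIM_1 - 1);
        const int z_valid = clamp(z, 0, (int)SRC_DIM_2 - 1);
        const DATA_TYPE v = *(__global DATA_TYPE *)(base + y_valid * (int)stride_y + z_valid * (int)stride_z);
        return (y == y_valid && z == z_valid) ? v : (DATA_TYPE)0;
    }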
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5, the output tile is 4x4/4x1 or 1x4 and the data layout is NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float). Supported data types: float/half.
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per work item (in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(NUM_TILES_Y)
- const int z = get_global_id(2) % NUM_TILES_Y;
- const int b = get_global_id(2) / NUM_TILES_Y;
-#else // defined(NUM_TILES_Y)
- const int z = get_global_id(2);
-#endif // defined(NUM_TILES_Y)
-
- // Compute input address
-#if defined(NUM_TILES_Y)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
-#else // defined(NUM_TILES_Y)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
-#endif // defined(NUM_TILES_Y)
-
- // Origin coordinates for the width (y) and height (z) in the input tensor
- int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
- int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
-
- // Coordinates to use to avoid out-of-bound reads
- int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1));
- int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1));
-
- // Boundary conditions
- int8 y_cond0 = y_coord_valid0 == y_coord0;
- int8 z_cond0 = z_coord_valid0 == z_coord0;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- // Load the input tile
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in_row0;
- in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
-
- // Calculate common factors for intermediate tensor
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = 0.0f;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp0 = in_row0;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
-
- OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- // Load the input tile
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in_row0;
- in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond);
-
- // Calculate common factors for intermediate tensor
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = 0.0f;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp0 = in_row0;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
-
- OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
-
- // Row0
- in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
-
- // Row1
- in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1);
-
- // Row2
- in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2);
-
- // Row3
- in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3);
-
- // Row4
- in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4);
-
- // Row5
- in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5);
-
- // Row6
- in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6);
-
- // Row7
- in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
-
- // Calculate intermediate tensor and reuse common factor vectors
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = in_row0 - in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
-
- comm_fact0 = (DATA_TYPE)2.5f * in_row3;
- comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.f * in_row5;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
-
- comm_fact1 = (DATA_TYPE)2.f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
- comm_fact2 = (DATA_TYPE)4.f * in_row2 - (DATA_TYPE)5.f * in_row4 + in_row6;
-
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
-
- // Calculate output rows (reuse comm_fact0 vector)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0, out1, out2, out3, out4, out5, out6, out7;
- OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- // Store values across the channels
-#if defined(NUM_TILES_Y)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* NUM_TILES_Y */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* NUM_TILES_Y */
-
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
- *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
- *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
- *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
- *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
- *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
- *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
- *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
- *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
- *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
- *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
- *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
- *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
- *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
- *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
- *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
- *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
- *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
- *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
- *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
- *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
- *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
- *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
- *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
- *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
- *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
- *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
- *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
- *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
- *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
- *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
- *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
- *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
- *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
- *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
- *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
- *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
- *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
- *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
- *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
- *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
- *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
- *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
- *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
- *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
- *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
- *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
- *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
- *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
- *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
- *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
- *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
- *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
- *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
- *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
- *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
- *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
- *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
- *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
- *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
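For reference, the tmp0...tmp7 rows assembled above via comm_fact0/1/2 are the rows of the 8x8 input-transform matrix used by F(4x4,5x5) (the same matrix serves F(6x6,3x3)), applied along one axis of the 8x8 input tile; OUTPUT_ROW_4x4_5x5, defined earlier in the file and not shown here, applies the same eight rows along the other axis. In matrix form:

$$B^T = \begin{bmatrix}
1 & 0 & -21/4 & 0 & 21/4 & 0 & -1 & 0 \\
0 & 1 & 1 & -17/4 & -17/4 & 1 & 1 & 0 \\
0 & -1 & 1 & 17/4 & -17/4 & -1 & 1 & 0 \\
0 & 1/2 & 1/4 & -5/2 & -5/4 & 2 & 1 & 0 \\
0 & -1/2 & 1/4 & 5/2 & -5/4 & -2 & 1 & 0 \\
0 & 2 & 4 & -5/2 & -5 & 1/2 & 1 & 0 \\
0 & -2 & 4 & 5/2 & -5 & -1/2 & 1 & 0 \\
0 & -1 & 0 & 21/4 & 0 & -21/4 & 0 & 1
\end{bmatrix}$$

Hence the 5.25 (= 21/4), 4.25 (= 17/4) and 1.25 (= 5/4) constants in the code; row 0 is exactly tmp0 = in_row0 - in_row6 + 5.25f * (in_row4 - in_row2).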
-
-/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=7).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-#if defined(NUM_TILES_Y)
- const int z = get_global_id(2) % NUM_TILES_Y;
- const int b = get_global_id(2) / NUM_TILES_Y;
-#else /* defined(NUM_TILES_Y) */
- const int z = get_global_id(2);
-#endif /* defined(NUM_TILES_Y) */
-
- // Compute input address
-#if defined(NUM_TILES_Y)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
-#else /* defined(NUM_TILES_Y) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
-#endif /* defined(NUM_TILES_Y) */
-
- // Origin coordinates for the width (y) and height (z) in the input tensor
- int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
- int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
-
- // Coordinates to use to avoid out-of-bound reads
- int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1));
- int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1));
-
- // Boundary conditions
- int8 y_cond0 = y_coord_valid0 == y_coord0;
- int8 z_cond0 = z_coord_valid0 == z_coord0;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- // Load the input tile
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in_row0;
- in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
-
- OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
-
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- // Load the input tile
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in_row0;
- in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond);
-
- // Calculate common factors for intermediate tensor
- VEC_DATA_TYPE(DATA_TYPE, 8)
- tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
-
- OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 8)
- in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
-
- // Row0
- in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0);
-
- // Row1
- in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1);
-
- // Row2
- in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2);
-
- // Row3
- in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3);
-
- // Row4
- in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
- in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4);
-
- // Row5
- in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
- in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5);
-
- // Row6
- in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
- in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6);
-
- // Row7
- in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
- in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
-
- FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7);
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
- VEC_DATA_TYPE(DATA_TYPE, 8)
- comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;
-
- // Calculate intermediate tensors
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;
- const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0, out1, out2, out3, out4, out5, out6, out7;
-
- OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);
- OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);
-
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Store values across the channels
-#if defined(NUM_TILES_Y)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* NUM_TILES_Y */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* NUM_TILES_Y */
-
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
- *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
- *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
- *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
- *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
- *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
- *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
- *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
- *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
- *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
- *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
- *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
- *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
- *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
- *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
- *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
- *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
- *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
- *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
- *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
- *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
- *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
- *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
- *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
- *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
- *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
- *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
- *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
- *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
- *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
- *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
- *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
- *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
- *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
- *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
- *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
- *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
- *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
- *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
- *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
- *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
- *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
- *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
- *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
- *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
- *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
- *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
- *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
- *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
- *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
- *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
- *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
- *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
- *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
- *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
- *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
- *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
- *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
- *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
- *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
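For reference, the computation above is the standard Winograd input transform of Lavin & Gray: the input tile is mapped into the Winograd domain before the element-wise multiplication with the transformed weights. In general form only (the concrete 8x8 matrix B is fixed by the F(2x2, 7x7) construction and is what the 36/49/14/13... coefficients above encode; it is not restated here):

```latex
% Winograd input transform, general form (Lavin & Gray, 2015).
% d: input tile (8x8 for F(2x2, 7x7)), B: the input-transform matrix.
V = B^{\mathsf{T}} \, d \, B    % 2D case (the #else branch above)
V = B^{\mathsf{T}} \, d         % 1D cases: B^T applied along the transformed axis only
```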
-#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
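A note on the boundary handling used throughout the NHWC kernels above: coordinates are first clamped so every load is in-bounds, then an equality mask zeroes the lanes whose unclamped coordinate fell outside the tensor, which amounts to implicit zero padding. A minimal single-lane sketch of the idiom (illustrative only; the FILL_ZERO_OUT_OF_BOUND_8_NHWC_H/_V macros expand to the vectorized form):

```c
// Illustrative OpenCL C: the clamp-then-mask zero-padding idiom, one lane.
// 'coord' may be negative (top/left pad) or >= dim (bottom/right pad).
float load_with_zero_pad(__global const float *src, int coord, int dim)
{
    const int   coord_valid = clamp(coord, 0, dim - 1); // always a legal index
    const float v           = src[coord_valid];         // read is always safe
    return (coord == coord_valid) ? v : 0.0f;           // zero the padded lane
}
```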
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
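Note how the 2x1/3x1 entry point above reuses the 2x2/3x3 body unchanged; the behavioural difference comes entirely from the -D options the host compiles the source with. A hedged host-side sketch of assembling those options (the macro names are the ones documented in the @note blocks above; the numeric values are placeholders):

```c
#include <stdio.h>
#include <CL/cl.h>

/* Hedged sketch: build the horizontal 2x1/3x1 specialization. The macro
   names come from the @note documentation above; values are illustrative. */
static cl_int build_winograd_2x1_3x1(cl_program program, cl_device_id device)
{
    char options[256];
    snprintf(options, sizeof(options),
             "-DDATA_TYPE=float -DNUM_TILES_X=5 -DPAD_LEFT=1 -DPAD_TOP=0 "
             "-DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=1 "
             "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
    return clBuildProgram(program, 1, &device, options, NULL, NULL);
}
```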
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
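The stepz2 suffix, together with the "multiple of 2" requirement, suggests each work-item transforms two channel planes, so the host halves the Z extent of the launch. A hypothetical helper under that assumption (this is an inference from the naming, not ACL's documented scheduler):

```c
/* Hypothetical: Z work-items needed when each work-item consumes step_z
   channel planes; the stepz2 kernels require num_channels % 2 == 0. */
static size_t winograd_gws_z(size_t num_channels, size_t step_z)
{
    return num_channels / step_z;
}
```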
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
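NUM_TILES_X is the number of output tiles along the width. For the stride-1 convolutions Winograd applies to, it follows from ordinary convolution arithmetic; a sketch under those usual assumptions (hypothetical helper; ceiling division so a partial trailing tile is still covered):

```c
/* Hypothetical helper: number of Winograd tiles covering the output width.
   Stride-1 convolution: out_w = in_w + pad_left + pad_right - kernel_w + 1. */
static int winograd_num_tiles_x(int in_w, int pad_left, int pad_right,
                                int kernel_w, int output_tile_w)
{
    const int out_w = in_w + pad_left + pad_right - kernel_w + 1;
    return (out_w + output_tile_w - 1) / output_tile_w; /* ceiling division */
}
```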
-
-#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
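Host code passes each tensor as the flattened parameter list TENSOR3D_DECLARATION expands to (a pointer plus the stride/step/offset scalars documented in the @param blocks), followed by the two W strides. A hedged sketch, assuming the eight-parameter-per-tensor expansion implied by those @param lists (the argument indices are therefore an assumption, not ACL's documented ABI):

```c
#include <CL/cl.h>

/* Hedged sketch: bind the trailing scalar arguments of the kernels above,
   assuming TENSOR3D_DECLARATION(src) occupies args 0..7 and
   TENSOR3D_DECLARATION(dst) args 8..15. */
static void set_w_strides(cl_kernel kernel, cl_uint src_stride_w, cl_uint dst_stride_w)
{
    clSetKernelArg(kernel, 16, sizeof(cl_uint), &src_stride_w);
    clSetKernelArg(kernel, 17, sizeof(cl_uint), &dst_stride_w);
}
```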
-
-/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
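From the get_global_id usage in the NHWC kernels above: dimension 0 walks the channels, dimension 1 the tiles along the width, and dimension 2 the tiles along the height, multiplied by the batch count when NUM_TILES_Y is defined. A hedged launch sketch following that mapping (sizes are placeholders; error handling omitted):

```c
#include <CL/cl.h>

/* Hedged sketch: enqueue an NHWC input-transform kernel. The gid->axis
   mapping mirrors the get_global_id calls above; sizes are illustrative. */
static cl_int enqueue_input_transform_nhwc(cl_command_queue queue, cl_kernel kernel,
                                           size_t channels, size_t num_tiles_x,
                                           size_t num_tiles_y, size_t batches)
{
    const size_t gws[3] = { channels, num_tiles_x, num_tiles_y * batches };
    return clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
}
```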
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes,
- src_stride_w,
- dst_stride_w);
-}
-#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
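
All of the input-transform kernels above assume the host passed a NUM_TILES_X that matches its own tiling of the output. A hedged C helper showing the usual derivation (the function name is ours; Winograd convolution is stride-1 by construction):

    /* Number of Winograd tiles along the width. With stride 1 the convolved
     * output width is in_w + pad_l + pad_r - kernel_w + 1; the tile count is
     * its ceiling division by the output tile width. */
    static int num_tiles_x(int in_w, int pad_l, int pad_r, int kernel_w, int out_tile_w)
    {
        const int out_w = in_w + pad_l + pad_r - kernel_w + 1;
        return (out_w + out_tile_w - 1) / out_tile_w;
    }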
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
deleted file mode 100644
index 0a7b5f50b2..0000000000
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ /dev/null
@@ -1,2266 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#include "activation_float_helpers.h"
-
-#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 2x2/2x1 or 1x2 tile according to the filter size
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute the 2x1 or 1x2 output tile
- // out00 = d00 + d01 + d02
- // out01 = d01 - d02 - d03
-
- float out00 = d00 + d01 + d02;
- float out01 = d01 - d02 - d03;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
- // Compute the 2x2 output tile
- float k0 = d01 + d11 + d21;
- float k1 = d02 + d12 + d22;
- float k2 = d11 - d21 - d31;
- float k3 = d12 - d22 - d32;
-
- // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
- // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
- // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
- // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
-
- float out00 = d10;
- float out01 = -d13;
- float out10 = d10;
- float out11 = -d13;
-
- out00 += d00 + d20 + k0 + k1;
- out01 += k0 - k1 - (d03 + d23);
- out10 += -d20 - d30 + k2 + k3;
- out11 += k2 - k3 + d23 + d33;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- int y_in = get_global_id(1);
- int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
- int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float)*((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out00 += (float)b;
- out01 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Get output address
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- const VEC_DATA_TYPE(DATA_TYPE, 2)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(HAS_BIAS)
- // Add bias
- out10 += (float)b;
- out11 += (float)b;
-#endif // defined(HAS_BIAS)
- vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
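
The 1D arithmetic in the kernel above is the standard Winograd F(2,3) output transform out = A^T * d with A^T = { {1, 1, 1, 0}, {0, 1, -1, -1} }. A scalar C reference of the same formulas, handy as a cross-check (the function name is hypothetical, not ACL API):

    /* Scalar reference for the 1D F(2,3) output transform used above. */
    static void winograd_output_f2_3(const float d[4], float out[2])
    {
        out[0] = d[0] + d[1] + d[2];
        out[1] = d[1] - d[2] - d[3];
    }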
-
-#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
- ({ \
- col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
- col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 - 3 * d5 + 3 * d6 + d7; \
- })
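
This macro is the 1D F(2,7) output transform applied to one column of the 8x8 input tile; in matrix form out = A^T * d with A^T = { {1, 1, 1, 1, 1, 1, 1, 0}, {0, -1, 1, -2, 2, -3, 3, 1} }. A scalar sketch of the same arithmetic (hypothetical name):

    /* Scalar equivalent of COMPUTE_TMP_COL_2x2_7x7. */
    static void winograd_output_f2_7(const float d[8], float out[2])
    {
        out[0] = d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
        out[1] = -d[1] + d[2] - 2.0f * d[3] + 2.0f * d[4] - 3.0f * d[5] + 3.0f * d[6] + d[7];
    }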
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_2x2_7x7_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- // Each thread stores a 2x2/2x1 or 1x2 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- int y_in = get_global_id(1);
- int x_out = get_global_id(0);
- int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
- __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);
-
-#if defined(SRC_DEPTH)
- dst_base_ptr += batch * dst_stride_w;
-#endif // defined(SRC_DEPTH)
-
- // Load the values across the channels to compose the input tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
- float out01 = -d01 + d02 - 2.0f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float)*((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
-
- out00 += (float)b;
- out01 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- dst_base_ptr += y_out * dst_stride_y;
-
- int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z;
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *(__global DATA_TYPE *)(dst_base_ptr + offset_z.s1) = out0_dt.s1;
- *(__global DATA_TYPE *)(dst_base_ptr + offset_z.s0) = out0_dt.s0;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- dst_base_ptr += z_out * dst_stride_z;
-
- int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE, 2)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL,
- B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1) = out0_dt.s1;
- *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0) = out0_dt.s0;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
- DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
- DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
- DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
- DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
- DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
-
- DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
- DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
- DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
- DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
- DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
- DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
- DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
- DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
-
- DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
- DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
- DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
- DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
- DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
- DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
- DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
- DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
-
- // Compute the 8x2 intermediate tensor
- VEC_DATA_TYPE(float, 2)
- tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
-
- COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
- COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
-
- // Compute the 2x2 output tile
- VEC_DATA_TYPE(float, 2)
- out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
- VEC_DATA_TYPE(float, 2)
- out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float)*((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
-
- out_col0 += (VEC_DATA_TYPE(float, 2))b;
- out_col1 += (VEC_DATA_TYPE(float, 2))b;
-
-#endif // defined(HAS_BIAS)
-
- int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y;
- int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z;
-
- // Store the output tile
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1) = out_col1_dt.s1;
- *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0) = out_col1_dt.s0;
- *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1) = out_col0_dt.s1;
- *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0) = out_col0_dt.s0;
-
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
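
The NHWC stores above rely on a clamp-and-overwrite trick for border tiles: out-of-range offsets are clamped with min() to the last valid row/column, and the lanes are written in reverse order so the genuinely valid element lands last and wins any overlap. A minimal C sketch of the idea (names are ours):

    /* Clamp an index to the last valid position. */
    static int clamp_idx(int i, int last) { return i < last ? i : last; }

    /* Store two lanes with border clamping: at the right edge both indices
     * collapse to width - 1, and writing in reverse order ensures the valid
     * value v0 overwrites the clamped duplicate. */
    static void store_pair_clamped(float *row, int x0, int width, float v0, float v1)
    {
        row[clamp_idx(x0 + 1, width - 1)] = v1;
        row[clamp_idx(x0, width - 1)]     = v0;
    }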
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the channels to compose the 6x6 or 6x1 tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
-
- // Compute out00, out01, out02 and out03
- float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
-
- float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
- float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
-
- out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
- out01 += k1 - d02 - d12 - d22 - d32 - d42;
- out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
- out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
-
- // Compute out10, out11, out12 and out13
- float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
-
- k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
-
- out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
- out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
- out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
- out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
-
- // Compute out20, out21, out22 and out23
- float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
-
- k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
-
- out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
- out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
- out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
- out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
-
- // Compute out30, out31, out32 and out33
- float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
-
- k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
-
- out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
- out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
- out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
- out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- int y_in = get_global_id(1);
- int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
- int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float)*((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out00 += (float)b;
- out01 += (float)b;
- out02 += (float)b;
- out03 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Get output address
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
- B_VAL);
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(HAS_BIAS)
- // Add bias
- out10 += (float)b;
- out11 += (float)b;
- out12 += (float)b;
- out13 += (float)b;
-
- out20 += (float)b;
- out21 += (float)b;
- out22 += (float)b;
- out23 += (float)b;
-
- out30 += (float)b;
- out31 += (float)b;
- out32 += (float)b;
- out33 += (float)b;
-#endif // defined(HAS_BIAS)
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
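
For reference, out00..out03 above implement the 1D F(4,3) output transform out = A^T * d, whose rows are {1, 1, 1, 1, 1, 0}, {0, 1, -1, 2, -2, 0}, {0, 1, 1, 4, 4, 0} and {0, 1, -1, 8, -8, 1}. A scalar C equivalent (hypothetical name):

    /* Scalar reference for the 1D F(4,3) output transform used above. */
    static void winograd_output_f4_3(const float d[6], float out[4])
    {
        out[0] = d[0] + d[1] + d[2] + d[3] + d[4];
        out[1] = d[1] - d[2] + 2.0f * d[3] - 2.0f * d[4];
        out[2] = d[1] + d[2] + 4.0f * d[3] + 4.0f * d[4];
        out[3] = d[1] - d[2] + 8.0f * d[3] - 8.0f * d[4] + d[5];
    }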
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x4_3x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
-
- // Compute out00, out01, out02 and out03
- float out00 = d01 + d21 + d41 + d11 + d31;
- float out01 = d01 + d21 + d41 + d11 + d31;
- float out02 = d01 + d21 + d41 + d11 + d31;
- float out03 = d01 + d21 + d41 + d11 + d31;
-
- float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
- float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
-
- out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
- out01 += k1 - d02 - d12 - d22 - d32 - d42;
- out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
- out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
-
- // Compute out10, out11, out12 and out13
- float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
-
- k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
-
- out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
- out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
- out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
- out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
-
- // Compute out20, out21, out22 and out23
- float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
-
- k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
-
- out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
- out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
- out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
- out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
-
- // Compute out30, out31, out32 and out33
- float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
- float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
-
- k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
-
- out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
- out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
- out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
- out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- int y_in = get_global_id(1);
- int x_out = get_global_id(0);
- int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
-
- out00 += (DATA_TYPE)b;
- out01 += (DATA_TYPE)b;
- out02 += (DATA_TYPE)b;
- out03 += (DATA_TYPE)b;
-#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- out10 += (DATA_TYPE)b;
- out11 += (DATA_TYPE)b;
- out12 += (DATA_TYPE)b;
- out13 += (DATA_TYPE)b;
-
- out20 += (DATA_TYPE)b;
- out21 += (DATA_TYPE)b;
- out22 += (DATA_TYPE)b;
- out23 += (DATA_TYPE)b;
-
- out30 += (DATA_TYPE)b;
- out31 += (DATA_TYPE)b;
- out32 += (DATA_TYPE)b;
- out33 += (DATA_TYPE)b;
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#endif // defined(HAS_BIAS)
-
- __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);
-
-#if defined(SRC_DEPTH)
- dst_base_ptr += batch * dst_stride_w;
-#endif // defined(SRC_DEPTH)
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- dst_base_ptr += y_out * dst_stride_y;
-
- int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;
-
- // Store the 1x4 output tile
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
- B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s3)) = out0_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s2)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s1)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s0)) = out0_dt.s0;
-
-#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-
- dst_base_ptr += z_out * dst_stride_z;
-
- int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)),
- A_VAL, B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3)) = out0_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0)) = out0_dt.s0;
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-
- int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;
- int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;
-
- // Store the 4x4 output tile
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),
- VEC_DATA_TYPE(DATA_TYPE, 4)),
- A_VAL, B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s3)) = out3_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s3)) = out3_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s3)) = out3_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s3)) = out3_dt.s0;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s2)) = out2_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s2)) = out2_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s2)) = out2_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s2)) = out2_dt.s0;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s1)) = out1_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s1)) = out1_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1)) = out1_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1)) = out1_dt.s0;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s0)) = out0_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s0)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0)) = out0_dt.s0;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-}
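All of these kernels share the same tile-indexing scheme: the launch flattens the 2D grid of output tiles into get_global_id(1), and each work-item recovers its tile origin with a modulo and a division by NUM_TILES_X before scaling by the output tile size. In the NHWC kernels the result feeds y_out/z_out (width and height sit on the y and z strides) while get_global_id(0) walks the channels; in the NCHW kernels it feeds x_out/y_out, with get_global_id(0) walking the output feature maps instead. A minimal sketch of the decomposition in plain C, with a hypothetical helper name:

/* Recover the origin of an output tile from the flattened index carried
 * in get_global_id(1), as the kernels in this file do. Illustrative
 * helper only; not part of the library. */
typedef struct { int w; int h; } TileOrigin;

static TileOrigin tile_origin(int tile_idx, int num_tiles_x,
                              int output_tile_w, int output_tile_h)
{
    TileOrigin o;
    o.w = (tile_idx % num_tiles_x) * output_tile_w; /* position along the width  */
    o.h = (tile_idx / num_tiles_x) * output_tile_h; /* position along the height */
    return o;
}

For a 16-tile-wide grid and 4x4 tiles, tile_idx = 18 lands at width 8 and height 4, matching the y_out/z_out arithmetic above.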
-
-#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
- ({ \
- comm_fact.s0 = d1 + d2; \
- comm_fact.s1 = d3 + d4; \
- comm_fact.s2 = d5 + d6; \
- \
- col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
- col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
- \
- comm_fact.s0 = d1 - d2; \
- comm_fact.s1 = d3 - d4; \
- comm_fact.s2 = d5 - d6; \
- \
- col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
- col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
- })
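Written out as a matrix, COMPUTE_TMP_COL applies the transpose of the 4x8 output-transform matrix of the F(4, 5) Winograd algorithm to one column of the 8x8 input tile. Reading the coefficients straight off the macro body, with a scalar reference added for checking (plain C; the function name is ours):

/* COMPUTE_TMP_COL evaluates col = A^T * (d0, ..., d7)^T with
 *
 *        | 1  1  1  1  1  8  8  0 |
 * A^T =  | 0  1 -1  2 -2  4 -4  0 |
 *        | 0  1  1  4  4  2  2  0 |
 *        | 0  1 -1  8 -8  1 -1  1 |
 */
static void compute_tmp_col_ref(float col[4], const float d[8])
{
    const float s01 = d[1] + d[2], s23 = d[3] + d[4], s45 = d[5] + d[6];
    const float m01 = d[1] - d[2], m23 = d[3] - d[4], m45 = d[5] - d[6];
    col[0] = d[0] + s01 + s23 + 8.f * s45;
    col[1] = m01 + 2.f * m23 + 4.f * m45;
    col[2] = s01 + 4.f * s23 + 2.f * s45;
    col[3] = m01 + 8.f * m23 + m45 + d[7];
}

The kernels below invoke the macro once per column of the 8x8 tile to build the 8x4 intermediate tensor, then apply the same four rows once more across the columns to obtain the final 4x4 tile.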
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_5x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
-
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Compute output address
- int y_in = get_global_id(1);
- int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
- int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(SRC_DEPTH)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the channels to compose the input tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out00 += (DATA_TYPE)b;
- out01 += (DATA_TYPE)b;
- out02 += (DATA_TYPE)b;
- out03 += (DATA_TYPE)b;
-#endif // defined(HAS_BIAS)
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
- B_VAL);
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
- DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
- DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
- DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
- DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
- DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
-
- DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
- DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
- DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
- DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
- DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
- DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
- DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
- DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
-
- DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
- DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
- DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
- DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
- DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
- DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
- DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
- DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
-
- // Compute the 8x4 intermediate tensor
- VEC_DATA_TYPE(float, 4)
- comm_fact0, comm_fact1, comm_fact2;
- VEC_DATA_TYPE(float, 4)
- tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
-
- COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
- COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
- COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
- COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
- COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
- COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
- COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
- COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
-
- // Compute the 4x4 output tile
- comm_fact0 = tmp_col1 + tmp_col2;
- comm_fact1 = tmp_col3 + tmp_col4;
- comm_fact2 = tmp_col5 + tmp_col6;
-
- VEC_DATA_TYPE(float, 4)
- out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
- VEC_DATA_TYPE(float, 4)
- out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
-
- comm_fact0 = tmp_col1 - tmp_col2;
- comm_fact1 = tmp_col3 - tmp_col4;
- comm_fact2 = tmp_col5 - tmp_col6;
-
- VEC_DATA_TYPE(float, 4)
- out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
- VEC_DATA_TYPE(float, 4)
- out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
- out_col0 += (VEC_DATA_TYPE(float, 4))b;
- out_col1 += (VEC_DATA_TYPE(float, 4))b;
- out_col2 += (VEC_DATA_TYPE(float, 4))b;
- out_col3 += (VEC_DATA_TYPE(float, 4))b;
-#endif // defined(HAS_BIAS)
-
- // Store the output tile
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
- vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0,
- (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
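The @note requirements in these doc-comments translate into a -D option string at program build time; anything fixed at compile time is never read from kernel arguments. A minimal host-side sketch using the standard OpenCL C API (the values and the activation spelling are illustrative assumptions; the NHWC variants additionally need -DDST_WIDTH and -DDST_HEIGHT):

#include <CL/cl.h>

/* Sketch: build the Winograd output-transform program with the
 * compile-time definitions documented above. Values are examples only. */
static cl_int build_winograd_output_transform(cl_program program, cl_device_id device)
{
    const char *opts =
        "-DDATA_TYPE=float "
        "-DNUM_TILES_X=16 "
        "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 "
        "-DVEC_SIZE=4 "
        "-DSRC_DEPTH=64 "  /* optional: enables the batched (4D) path */
        "-DHAS_BIAS "      /* optional: adds the bias vector argument */
        "-DACTIVATION_TYPE=relu -DA_VAL=0.0f -DB_VAL=0.0f"; /* assumed spelling */
    return clBuildProgram(program, 1, &device, opts, NULL, NULL);
}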
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x4_5x5_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- int y_in = get_global_id(1);
- int x_out = get_global_id(0);
- int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
- int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
-#if defined(SRC_DEPTH)
- int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
- __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE);
-
-#if defined(SRC_DEPTH)
- dst_base_ptr += batch * dst_stride_w;
-#endif // defined(SRC_DEPTH)
-
- // Load the values across the channels to compose the input tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
- DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
-
- out00 += (float)b;
- out01 += (float)b;
- out02 += (float)b;
- out03 += (float)b;
-#endif // defined(HAS_BIAS)
-
- // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- dst_base_ptr += y_out * dst_stride_y;
-
- int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
- B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s3)) = out0_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s2)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s1)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s0)) = out0_dt.s0;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- dst_base_ptr += z_out * dst_stride_z;
-
- int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;
-
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
- B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3)) = out0_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2)) = out0_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1)) = out0_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0)) = out0_dt.s0;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
- DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
- DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
- DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
- DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
- DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
-
- DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
- DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
- DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
- DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
- DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
- DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
- DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
- DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
-
- DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
- DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
- DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
- DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
- DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
- DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
- DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
- DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
-
- // Compute the 8x4 intermediate tensor
- VEC_DATA_TYPE(float, 4)
- comm_fact0, comm_fact1, comm_fact2;
- VEC_DATA_TYPE(float, 4)
- tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
-
- COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
- COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
- COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
- COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
- COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
- COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
- COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
- COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
-
- // Compute the output tile
- comm_fact0 = tmp_col1 + tmp_col2;
- comm_fact1 = tmp_col3 + tmp_col4;
- comm_fact2 = tmp_col5 + tmp_col6;
-
- VEC_DATA_TYPE(float, 4)
- out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
- VEC_DATA_TYPE(float, 4)
- out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
-
- comm_fact0 = tmp_col1 - tmp_col2;
- comm_fact1 = tmp_col3 - tmp_col4;
- comm_fact2 = tmp_col5 - tmp_col6;
-
- VEC_DATA_TYPE(float, 4)
- out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
- VEC_DATA_TYPE(float, 4)
- out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
-
-#if defined(HAS_BIAS)
- // Add bias
- Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
- float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
-
- out_col0 += (VEC_DATA_TYPE(float, 4))b;
- out_col1 += (VEC_DATA_TYPE(float, 4))b;
- out_col2 += (VEC_DATA_TYPE(float, 4))b;
- out_col3 += (VEC_DATA_TYPE(float, 4))b;
-#endif // defined(HAS_BIAS)
-
- int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y;
- int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z;
-
- // Store the output tile
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL);
-
- // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element
- // is overwritten with the valid one
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s3)) = out_col3_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s3)) = out_col2_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s3)) = out_col1_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s3)) = out_col0_dt.s3;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s2)) = out_col3_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s2)) = out_col2_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s2)) = out_col1_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s2)) = out_col0_dt.s2;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s1)) = out_col3_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s1)) = out_col2_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1)) = out_col1_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1)) = out_col0_dt.s1;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s0)) = out_col3_dt.s0;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s0)) = out_col2_dt.s0;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0)) = out_col1_dt.s0;
- *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0)) = out_col0_dt.s0;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
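The clamped stores used throughout the NHWC kernels avoid branches on the tensor border: offsets past the last row or column are clamped with min(), so several lanes of a tile can alias the same address, and writing the lanes from .s3 down to .s0 guarantees that the lane with the smallest, still-valid coordinate lands last. A small self-contained C model of the idea:

#include <stdio.h>

/* Model of the clamp-and-reverse-store trick: a 1x4 tile written at
 * origin y0 into a row of width w. Out-of-range lanes clamp onto the
 * last element; the reversed write order lets valid data overwrite the
 * clamped duplicates. Illustration only. */
static void store_clamped(float *row, int w, int y0, const float v[4])
{
    int off[4];
    for (int i = 0; i < 4; ++i)
        off[i] = (y0 + i < w - 1) ? (y0 + i) : (w - 1); /* min(y0 + i, w - 1) */
    row[off[3]] = v[3];
    row[off[2]] = v[2];
    row[off[1]] = v[1];
    row[off[0]] = v[0]; /* smallest lane writes last */
}

int main(void)
{
    float row[6] = { 0 };
    const float tile[4] = { 1.f, 2.f, 3.f, 4.f };
    store_clamped(row, 6, 4, tile);    /* lanes 2 and 3 clamp onto row[5]  */
    printf("%g %g\n", row[4], row[5]); /* prints "1 2": the valid lane wins */
    return 0;
}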
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
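Every store above funnels its result through ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL), fusing the activation into the output transform rather than paying for a separate pass over the tensor. The macro itself lives in a shared helper header; as a rough model of its assumed net effect, a bounded-ReLU instantiation with upper bound A_VAL and lower bound B_VAL would reduce to a clamp (OpenCL C):

/* Rough model only: the assumed effect of ACTIVATION for a bounded ReLU.
 * The real macro dispatches on ACTIVATION_TYPE; this is not its code. */
float4 bounded_relu_model(float4 x, float a_val, float b_val)
{
    return min(max(x, (float4)b_val), (float4)a_val);
}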
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
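The 1D variants in this section are thin wrappers: they forward every tensor argument unchanged to the corresponding 2D kernel, and the OUTPUT_TILE_W/OUTPUT_TILE_H values together with the WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL or _VERTICAL define select the 1D path inside it. The kernel names follow a fixed tile-by-filter-by-layout pattern, so a host dispatcher could plausibly assemble them directly; a hypothetical sketch:

#include <stdio.h>

/* Hypothetical: derive one of the kernel names in this file from the
 * output-tile and filter geometry. Not library code. */
static const char *winograd_output_kernel_name(int tile_w, int tile_h,
                                               int filt_w, int filt_h,
                                               int nhwc, char *buf, size_t n)
{
    snprintf(buf, n, "winograd_output_transform_%dx%d_%dx%d_%s",
             tile_w, tile_h, filt_w, filt_h, nhwc ? "nhwc" : "nchw");
    return buf;
}

For example, (2, 1, 3, 1, 0) yields "winograd_output_transform_2x1_3x1_nchw", the wrapper above.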
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_2x1_7x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x1_3x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x1_5x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
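
Each variant above is specialized entirely through build-time defines: the host compiles the program with the documented -D options and then creates the kernel by name. A minimal host-side sketch for the 4x1/5x1 horizontal NHWC variant follows; the -D macro names come from the kernel documentation above, while the concrete values (16 tiles, a 24x32 output) and the helper name are illustrative assumptions, not ACL's actual dispatch code.

    #include <CL/cl.h>
    #include <stddef.h>

    /* Sketch only: build the horizontal 4x1/5x1 NHWC output transform.
     * The option values and this function's name are assumptions for
     * illustration. */
    static cl_kernel build_winograd_output_4x1_5x1_nhwc(cl_program program, cl_device_id device)
    {
        const char *opts =
            "-DNUM_TILES_X=16 "
            "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=1 "
            "-DDST_WIDTH=24 -DDST_HEIGHT=32 "
            "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL "
            "-DVEC_SIZE=4 " /* this kernel sits behind the VEC_SIZE == 4 guard */
            "-DDATA_TYPE=float";
        cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL);
        if(err != CL_SUCCESS)
        {
            return NULL;
        }
        return clCreateKernel(program, "winograd_output_transform_4x1_5x1_nhwc", &err);
    }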
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs the Winograd output transform when the output tile is 1x2, the filter size is 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs the Winograd output transform when the output tile is 1x2, the filter size is 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x7_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
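
The preprocessor guards pair VEC_SIZE with the tile geometry: VEC_SIZE == 2 encloses the 1x2 tiles above, and VEC_SIZE == 4 encloses the 1x4 tiles below, so VEC_SIZE tracks the larger dimension of the output tile. A one-line sketch of how a host dispatcher could derive it; this is an inference from the guards in this file, not ACL's actual selection logic.

    /* Assumed helper, inferred from the VEC_SIZE guards in this file:
     * 2x2 / 2x1 / 1x2 tiles compile with VEC_SIZE == 2, and
     * 4x4 / 4x1 / 1x4 tiles with VEC_SIZE == 4. */
    static int winograd_output_vec_size(int tile_w, int tile_h)
    {
        return (tile_w > tile_h) ? tile_w : tile_h;
    }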
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs the Winograd output transform when the output tile is 1x4, the filter size is 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs the Winograd output transform when the output tile is 1x4, the filter size is 1x5 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
-
-/** This OpenCL kernel performs the Winograd output transform when the output tile is 1x4, the filter size is 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-
-/** This OpenCL kernel performs the Winograd output transform when the output tile is 1x4, the filter size is 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/yolo_layer.cl b/src/core/CL/cl_kernels/yolo_layer.cl
deleted file mode 100644
index 9601dddf67..0000000000
--- a/src/core/CL/cl_kernels/yolo_layer.cl
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
-
-#include "activation_float_helpers.h"
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-#if VEC_SIZE != 1
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-
-/** This OpenCL kernel performs a YOLO partial activation function for the NCHW data layout
- *
- * @note To perform the activation function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Activation function should be given as a preprocessor argument using -DACTIVATION_TYPE=name. e.g. -DACTIVATION_TYPE=TANH
- * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void yolo_layer_nchw(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- const int box_ch_id = get_global_id(2) % (NUM_CLASSES + 5);
- const bool activate = box_ch_id != 2 && box_ch_id != 3;
-
- if(activate)
- {
- // Load data
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
- data = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, data, A_VAL, B_VAL);
-
- // Store result
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
- }
-#ifndef IN_PLACE
- else
- {
- // Load data
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
-
- // Store result
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
- }
-#endif // IN_PLACE
-}
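
The predicate above encodes the usual YOLO box layout: each anchor box contributes NUM_CLASSES + 5 consecutive channels, conventionally (x, y, w, h, objectness, class scores...), and channels 2 and 3 (the box width and height) are passed through untouched because they are conventionally decoded with an exponential elsewhere rather than the activation applied here. Below is a scalar C reference of the same selection logic; the function name and signature are assumptions for the sketch, and it models only the channel indexing, not the tensor's memory layout.

    /* Illustrative scalar reference of the channel selection performed by
     * the kernels in this file; not part of ACL. */
    static void yolo_reference(float *data, int num_elements, int num_classes,
                               float (*activation)(float))
    {
        for(int i = 0; i < num_elements; ++i)
        {
            const int box_ch_id = i % (num_classes + 5); /* position inside one box */
            if(box_ch_id != 2 && box_ch_id != 3)         /* skip the w and h channels */
            {
                data[i] = activation(data[i]);
            }
        }
    }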
-
-#else // VEC_SIZE != 1
-
-/** This OpenCL kernel performs a YOLO partial activation function for the NHWC data layout
- *
- * @note To perform the activation function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=1
- * @note Activation function should be given as a preprocessor argument using -DACTIVATION_TYPE=name. e.g. -DACTIVATION_TYPE=TANH
- * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
- */
-__kernel void yolo_layer_nhwc(
- TENSOR3D_DECLARATION(input)
-#ifndef IN_PLACE
- ,
- TENSOR3D_DECLARATION(output)
-#endif /* not IN_PLACE */
-)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D output = input;
-#else /* IN_PLACE */
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-
- const int box_ch_id = get_global_id(0) % (NUM_CLASSES + 5);
- const bool activate = box_ch_id != 2 && box_ch_id != 3;
-
- if(activate)
- {
- // Load data
- DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
- data = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, data, A_VAL, B_VAL);
-
- // Store result
- *((__global DATA_TYPE *)output.ptr) = data;
- }
-#ifndef IN_PLACE
- else
- {
- // Load data
- DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
-
- // Store result
- *((__global DATA_TYPE *)output.ptr) = data;
- }
-#endif // IN_PLACE
-}
-
-#endif // VEC_SIZE != 1
-#endif // defined(DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
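
Note how VEC_SIZE selects between the two entry points: VEC_SIZE != 1 compiles yolo_layer_nchw, which can vectorize along X because NCHW keeps a channel's spatial plane contiguous, while VEC_SIZE == 1 compiles the scalar yolo_layer_nhwc, where adjacent elements cross channel boundaries. A hedged sketch of matching build options, assembled from the @note examples in the documentation above; the concrete values are assumptions.

    /* Illustrative build options for the scalar NHWC path; the values are
     * assumptions taken from the @note examples above. */
    const char *yolo_nhwc_opts =
        "-DDATA_TYPE=float "
        "-DVEC_SIZE=1 "           /* selects yolo_layer_nhwc */
        "-DACTIVATION_TYPE=TANH " /* example activation from the notes above */
        "-DA_VAL=1.0f -DB_VAL=1.0f "
        "-DNUM_CLASSES=80 "
        "-DIN_PLACE";             /* optional: write results back into the input */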